diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 000000000000..97c040337b80 --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,130 @@ +# This file defines module label mappings for the Lucene project. +# Each module is associated with a set of file globs that, when matched, +# will trigger the corresponding label to be applied to pull requests. +# +# This configuration is used by the workflow defined in .github/workflows/label-pull-request.yml. +# If new labels are added or modules are refactored, the file globs here will need to be updated to ensure that the correct labels are applied. +# +# For more information on how to define globs, visit: https://github.com/actions/labeler + +module:analysis: + - changed-files: + - any-glob-to-any-file: 'lucene/analysis/**' + +module:benchmark: + - changed-files: + - any-glob-to-any-file: 'lucene/benchmark/**' + +module:classification: + - changed-files: + - any-glob-to-any-file: 'lucene/classification/**' + +module:core/codecs: + - changed-files: + - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/codecs/**', 'lucene/core/src/test/org/apache/lucene/codecs/**'] + +module:core/FSTs: + - changed-files: + - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/util/fst/**', 'lucene/core/src/test/org/apache/lucene/util/fst/**'] + +module:core/hnsw: + - changed-files: + - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/util/hnsw/**', 'lucene/core/src/test/org/apache/lucene/util/hnsw/**'] + +module:core/index: + - changed-files: + - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/index/**', 'lucene/core/src/test/org/apache/lucene/index/**'] + +module:core/search: + - changed-files: + - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/search/**', 'lucene/core/src/test/org/apache/lucene/search/**'] + +module:core/store: + - changed-files: + - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/store/**', 'lucene/core/src/test/org/apache/lucene/store/**'] + +module:core/other: + - all: + - changed-files: + - any-glob-to-any-file: ['lucene/core/**'] + - all-globs-to-all-files: + - '!lucene/core/src/java/org/apache/lucene/codecs/**' + - '!lucene/core/src/test/org/apache/lucene/codecs/**' + - '!lucene/core/src/java/org/apache/lucene/util/fst/**' + - '!lucene/core/src/test/org/apache/lucene/util/fst/**' + - '!lucene/core/src/java/org/apache/lucene/util/hnsw/**' + - '!lucene/core/src/test/org/apache/lucene/util/hnsw/**' + - '!lucene/core/src/java/org/apache/lucene/index/**' + - '!lucene/core/src/test/org/apache/lucene/index/**' + - '!lucene/core/src/java/org/apache/lucene/search/**' + - '!lucene/core/src/test/org/apache/lucene/search/**' + - '!lucene/core/src/java/org/apache/lucene/store/**' + - '!lucene/core/src/test/org/apache/lucene/store/**' + +module:demo: + - changed-files: + - any-glob-to-any-file: 'lucene/demo/**' + +module:expressions: + - changed-files: + - any-glob-to-any-file: 'lucene/expressions/**' + +module:facet: + - changed-files: + - any-glob-to-any-file: 'lucene/facet/**' + +module:grouping: + - changed-files: + - any-glob-to-any-file: 'lucene/grouping/**' + +module:highlighter: + - changed-files: + - any-glob-to-any-file: 'lucene/highlighter/**' + +module:join: + - changed-files: + - any-glob-to-any-file: 'lucene/join/**' + +module:luke: + - changed-files: + - any-glob-to-any-file: 'lucene/luke/**' + +module:misc: + - changed-files: + - any-glob-to-any-file: 'lucene/misc/**' + +module:monitor: + - changed-files: + - any-glob-to-any-file: 'lucene/monitor/**' + +module:queries: + - changed-files: + - any-glob-to-any-file: 'lucene/queries/**' + +module:queryparser: + - changed-files: + - any-glob-to-any-file: 'lucene/queryparser/**' + +module:replicator: + - changed-files: + - any-glob-to-any-file: 'lucene/replicator/**' + +module:sandbox: + - changed-files: + - any-glob-to-any-file: 'lucene/sandbox/**' + +module:spatial: + - changed-files: + - any-glob-to-any-file: ['lucene/spatial-extras/**', 'lucene/spatial-test-fixtures/**'] + +module:spatial3d: + - changed-files: + - any-glob-to-any-file: 'lucene/spatial3d/**' + +module:suggest: + - changed-files: + - any-glob-to-any-file: 'lucene/suggest/**' + +module:test-framework: + - changed-files: + - any-glob-to-any-file: 'lucene/test-framework/**'
diff --git a/.github/workflows/label-pull-request.yml b/.github/workflows/label-pull-request.yml new file mode 100644 index 000000000000..19932d51c04c --- /dev/null +++ b/.github/workflows/label-pull-request.yml @@ -0,0 +1,23 @@ +# This file defines the workflow for labeling pull requests with module tags based on the changed files in the PR. +# It uses the `actions/labeler` GitHub Action for this purpose. +# +# The workflow is triggered on the `pull_request_target` event, which ensures the workflow is only run from the master branch. +# The job `labeler` runs on `ubuntu-latest` and has permissions to read contents and write pull requests. +# +# For more information on the `actions/labeler` GitHub Action, refer to https://github.com/actions/labeler + +name: "Pull Request Labeler" +run-name: Labeling pull request with module tags based on changed files in the PR +on: + - pull_request_target + +jobs: + labeler: + permissions: + contents: read + pull-requests: write + runs-on: ubuntu-latest + steps: + - uses: actions/labeler@v5 + with: + sync-labels: true \ No newline at end of file
diff --git a/.github/workflows/run-checks-all.yml b/.github/workflows/run-checks-all.yml index 18dd308e9a77..fdf23e4c3460 100644 --- a/.github/workflows/run-checks-all.yml +++ b/.github/workflows/run-checks-all.yml @@ -13,7 +13,7 @@ on: - 'branch_10x' env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }} # We split the workflow into two parallel jobs for efficiency: # one is running all validation checks without tests,
diff --git a/.github/workflows/run-checks-gradle-upgrade.yml b/.github/workflows/run-checks-gradle-upgrade.yml index 07b7210cf4e2..b026ce96bba2 100644 --- a/.github/workflows/run-checks-gradle-upgrade.yml +++ b/.github/workflows/run-checks-gradle-upgrade.yml @@ -20,7 +20,7 @@ on: - 'gradle/wrapper/**' env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }} jobs: gradleSanityCheck:
diff --git a/.github/workflows/run-checks-mod-analysis-common.yml b/.github/workflows/run-checks-mod-analysis-common.yml index a208039a99fa..5f53263b3229 100644 --- a/.github/workflows/run-checks-mod-analysis-common.yml +++ b/.github/workflows/run-checks-mod-analysis-common.yml @@ -20,7 +20,7 @@ on: - 'lucene/analysis/common/**' env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }} jobs: test:
diff --git a/.github/workflows/run-checks-mod-distribution.tests.yml b/.github/workflows/run-checks-mod-distribution.tests.yml index e3af5812c80c..b78db6dd9463 100644 --- a/.github/workflows/run-checks-mod-distribution.tests.yml +++
b/.github/workflows/run-checks-mod-distribution.tests.yml @@ -14,7 +14,7 @@ on: - 'branch_10x' env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }} jobs: test: diff --git a/README.md b/README.md index c613a16986ea..c2c963ef50e3 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Apache Lucene is a high-performance, full-featured text search engine library written in Java. [![Build Status](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/badge/icon?subject=Lucene)](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/) -[![Revved up by Develocity](https://img.shields.io/badge/Revved%20up%20by-Develocity-06A0CE?logo=Gradle&labelColor=02303A)](https://ge.apache.org/scans?search.buildToolType=gradle&search.rootProjectNames=lucene-root) +[![Revved up by Develocity](https://img.shields.io/badge/Revved%20up%20by-Develocity-06A0CE?logo=Gradle&labelColor=02303A)](https://develocity.apache.org/scans?search.buildToolType=gradle&search.rootProjectNames=lucene-root) ## Online Documentation diff --git a/build-tools/build-infra/build.gradle b/build-tools/build-infra/build.gradle index 5cb1426cba97..34d71f7509d3 100644 --- a/build-tools/build-infra/build.gradle +++ b/build-tools/build-infra/build.gradle @@ -22,6 +22,7 @@ plugins { } repositories { + mavenLocal() mavenCentral() } diff --git a/dev-tools/scripts/releaseWizard.py b/dev-tools/scripts/releaseWizard.py index d599095619d4..3814ae38a789 100755 --- a/dev-tools/scripts/releaseWizard.py +++ b/dev-tools/scripts/releaseWizard.py @@ -49,6 +49,7 @@ from collections import OrderedDict from datetime import datetime from datetime import timedelta +from datetime import timezone try: import holidays @@ -99,7 +100,7 @@ def expand_jinja(text, vars=None): 'state': state, 'gpg_key' : state.get_gpg_key(), 'gradle_cmd' : 'gradlew.bat' if is_windows() else './gradlew', - 'epoch': unix_time_millis(datetime.utcnow()), + 'epoch': unix_time_millis(datetime.now(tz=timezone.utc)), 'get_next_version': state.get_next_version(), 'current_git_rev': state.get_current_git_rev(), 'keys_downloaded': keys_downloaded(), @@ -199,7 +200,7 @@ def check_prerequisites(todo=None): return True -epoch = datetime.utcfromtimestamp(0) +epoch = datetime.fromtimestamp(timestamp=0, tz=timezone.utc) def unix_time_millis(dt): @@ -279,7 +280,7 @@ def __init__(self, config_path, release_version, script_version): self.latest_version = None self.previous_rcs = {} self.rc_number = 1 - self.start_date = unix_time_millis(datetime.utcnow()) + self.start_date = unix_time_millis(datetime.now(tz=timezone.utc)) self.script_branch = run("git rev-parse --abbrev-ref HEAD").strip() self.mirrored_versions = None try: @@ -741,7 +742,7 @@ def get_vars(self): def set_done(self, is_done): if is_done: - self.state['done_date'] = unix_time_millis(datetime.utcnow()) + self.state['done_date'] = unix_time_millis(datetime.now(tz=timezone.utc)) if self.persist_vars: for k in self.persist_vars: self.state[k] = self.get_vars()[k] @@ -935,7 +936,7 @@ def expand_multiline(cmd_txt, indent=0): def unix_to_datetime(unix_stamp): - return datetime.utcfromtimestamp(unix_stamp / 1000) + return datetime.fromtimestamp(timestamp=unix_stamp / 1000, tz=timezone.utc) def generate_asciidoc(): @@ -949,7 +950,7 @@ def generate_asciidoc(): fh.write("= Lucene Release %s\n\n" % state.release_version) fh.write("(_Generated by releaseWizard.py v%s at %s_)\n\n" - % (getScriptVersion(), datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC"))) + % 
(getScriptVersion(), datetime.now(tz=timezone.utc).strftime("%Y-%m-%d %H:%M UTC"))) fh.write(":numbered:\n\n") fh.write("%s\n\n" % template('help')) for group in state.todo_groups: @@ -1839,9 +1840,9 @@ def create_ical(todo): # pylint: disable=unused-argument return True -today = datetime.utcnow().date() +today = datetime.now(tz=timezone.utc).date() sundays = {(today + timedelta(days=x)): 'Sunday' for x in range(10) if (today + timedelta(days=x)).weekday() == 6} -y = datetime.utcnow().year +y = datetime.now(tz=timezone.utc).year years = [y, y+1] non_working = holidays.CA(years=years) + holidays.US(years=years) + holidays.UK(years=years) \ + holidays.DE(years=years) + holidays.NO(years=years) + holidays.IN(years=years) + holidays.RU(years=years) @@ -1849,7 +1850,7 @@ def create_ical(todo): # pylint: disable=unused-argument def vote_close_72h_date(): # Voting open at least 72 hours according to ASF policy - return datetime.utcnow() + timedelta(hours=73) + return datetime.now(tz=timezone.utc) + timedelta(hours=73) def vote_close_72h_holidays(): diff --git a/gradle/datasets/external-datasets.gradle b/gradle/datasets/external-datasets.gradle index 44fd38117bb3..155382e4f463 100644 --- a/gradle/datasets/external-datasets.gradle +++ b/gradle/datasets/external-datasets.gradle @@ -52,6 +52,9 @@ configure(project(":lucene:benchmark")) { dst = file("${dataDir}/${name}") } + // TODO: dataset gone. https://github.com/apache/lucene/issues/13647 + onlyIf { false } + outputs.file ext.dst src ext.src @@ -73,6 +76,9 @@ configure(project(":lucene:benchmark")) { dst = file("${dataDir}/${name}") } + // TODO: dataset gone. https://github.com/apache/lucene/issues/13647 + onlyIf { false } + outputs.file ext.dst src ext.src @@ -99,6 +105,9 @@ configure(project(":lucene:benchmark")) { outputs.file ext.dst + // TODO: dataset gone. https://github.com/apache/lucene/issues/13647 + onlyIf { false } + src ext.src dest ext.intermediate overwrite false @@ -118,6 +127,9 @@ configure(project(":lucene:benchmark")) { dst = file("${dataDir}/${name}") } + // TODO: dataset gone. https://github.com/apache/lucene/issues/13647 + onlyIf { false } + outputs.dir ext.dst src ext.src diff --git a/gradle/ge.gradle b/gradle/ge.gradle index f6bba24f23f5..c4677859e33a 100644 --- a/gradle/ge.gradle +++ b/gradle/ge.gradle @@ -17,13 +17,13 @@ def isCIBuild = System.getenv().keySet().find { it ==~ /(?i)((JENKINS|HUDSON)(_\w+)?|CI)/ } != null -gradleEnterprise { - server = "https://ge.apache.org" +develocity { + server = "https://develocity.apache.org" + projectId = "lucene" + buildScan { - capture { taskInputFiles = true } uploadInBackground = !isCIBuild - publishAlways() - publishIfAuthenticated() + publishing.onlyIf { it.isAuthenticated() } obfuscation { ipAddresses { addresses -> addresses.collect { address -> "0.0.0.0"} } } @@ -35,7 +35,7 @@ buildCache { enabled = !isCIBuild } - remote(gradleEnterprise.buildCache) { + remote(develocity.buildCache) { enabled = false } } diff --git a/gradle/globals.gradle b/gradle/globals.gradle index bcab6461ea91..25bfddc9bebf 100644 --- a/gradle/globals.gradle +++ b/gradle/globals.gradle @@ -22,6 +22,7 @@ allprojects { // Repositories to fetch dependencies from. 
repositories { + mavenLocal() mavenCentral() }
diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle index 14e64647d667..b636162ea96d 100644 --- a/gradle/testing/defaults-tests.gradle +++ b/gradle/testing/defaults-tests.gradle @@ -143,7 +143,7 @@ allprojects { ':lucene:codecs', ":lucene:distribution.tests", ":lucene:test-framework" - ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core') + ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core,com.nvidia.cuvs') // TODO: make this sandbox only def loggingConfigFile = layout.projectDirectory.file("${resources}/logging.properties") def tempDir = layout.projectDirectory.dir(testsTmpDir.toString())
diff --git a/gradle/testing/randomization.gradle b/gradle/testing/randomization.gradle index 670f8ef2689e..185cd0872a9c 100644 --- a/gradle/testing/randomization.gradle +++ b/gradle/testing/randomization.gradle @@ -76,7 +76,9 @@ allprojects { [propName: 'tests.asserts', value: "true", description: "Enables or disables assertions mode."], [propName: 'tests.infostream', value: false, description: "Enables or disables infostream logs."], [propName: 'tests.leaveTemporary', value: false, description: "Leave temporary directories after tests complete."], - [propName: 'tests.useSecurityManager', value: true, description: "Control security manager in tests.", buildOnly: true], + [propName: 'tests.useSecurityManager', + value: { -> rootProject.ext.runtimeJavaVersion <= JavaVersion.VERSION_23 ? 'true' : 'false' }, + description: "Control security manager in tests.", buildOnly: true], // component randomization [propName: 'tests.codec', value: "random", description: "Sets the codec tests should run with."], [propName: 'tests.directory', value: "random", description: "Sets the Directory implementation tests should run with."],
diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy index f8e09ba03661..41cb5d60e44e 100644 --- a/gradle/testing/randomization/policies/tests.policy +++ b/gradle/testing/randomization/policies/tests.policy @@ -80,6 +80,12 @@ grant { permission java.io.FilePermission "${hunspell.corpora}${/}-", "read"; permission java.io.FilePermission "${hunspell.dictionaries}", "read"; permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read"; + + // TODO: these are just temporary to allow testing with cuvs-java + permission java.lang.RuntimePermission "getenv.CUVS_JAVA_SO_PATH"; + permission java.io.FilePermission "${/}-", "read"; + // For temporary files to communicate with cuvs + permission java.io.FilePermission "${/}tmp${/}-", "write,delete"; }; // Permissions for jacoco code coverage
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2e574668a273..6eb52ac0b9ba 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -9,6 +9,8 @@ API Changes --------------------- * GITHUB#11023: Removing deprecated parameters from CheckIndex. (Jakub Slowinski) +* GITHUB#14165: TieredMergePolicy's maxMergeAtOnce parameter was removed. (Adrien Grand) + New Features --------------------- * GITHUB#14097: Binary partitioning merge policy over float-valued vector field. (Mike Sokolov) @@ -16,7 +18,7 @@ New Features Improvements --------------------- -* GITHUB#266: TieredMergePolicy's maxMergeAtOnce default value was changed from 10 to 30. (Adrien Grand) +(No changes) Optimizations --------------------- @@ -41,15 +43,37 @@ API Changes * GITHUB#14069: Added DocIdSetIterator#intoBitSet API to let implementations optimize loading doc IDs into a bit set. (Adrien Grand) +* GITHUB#14134: Added Bits#applyMask API to help apply live docs as a mask on a + bit set of matches. (Adrien Grand) + New Features --------------------- -(No changes) + +* GITHUB#14084, GITHUB#13635, GITHUB#13634: Adds new `SeededKnnByteVectorQuery` and `SeededKnnFloatVectorQuery` + queries. These queries allow for the vector search entry points to be initialized via a `seed` query. This follows + the research provided via https://arxiv.org/abs/2307.16779. (Sean MacAvaney, Ben Trent). + Improvements --------------------- * GITHUB#14079: Hunspell Dictionary now supports an option to tolerate REP rule count mismatches. (Robert Muir) +* GITHUB#13984: Add HNSW graph checks and stats to CheckIndex. + +* GITHUB#14113: Remove unnecessary ByteArrayDataInput allocations from `Lucene90DocValuesProducer$TermsDict.decompressBlock`. (Ankit Jain) + +* GITHUB#14138: Implement IntersectVisitor#visit(IntsRef) in many of the current implementations and add + BulkAdder#add(IntsRef) method. They should provide better performance due to less virtual method calls and + more efficient bulk processing. (Ignacio Vera) + +* GITHUB#14107, GITHUB#14124, GITHUB#14103: Optimize DirectIOIndexInput; add + individual and bulk data retrieval overloads; avoid double buffering with + slices. (Chris Hegarty) + +* GITHUB#14166: Log(ByteSize|Doc)MergePolicy now allow merging more than + mergeFactor segments together when the merge is below the min merge size. + (Adrien Grand) Optimizations --------------------- @@ -59,13 +83,33 @@ Optimizations * GITHUB#14080: Use the `DocIdSetIterator#loadIntoBitSet` API to speed up dense conjunctions. (Adrien Grand) +* GITHUB#14133: Dense blocks of postings are now encoded as bit sets. + (Adrien Grand) + +* GITHUB#14169: Optimize ContextQuery with a large number of contexts. (Mayya Sharipova) + Bug Fixes --------------------- -(No changes) + +* GITHUB#14109: prefetch may select the wrong memory segment for + multi-segment slices. (Chris Hegarty) + +* GITHUB#14123: SortingCodecReader NPE when segment has no (points, vectors, etc...) (Mike Sokolov) + +* GITHUB#14126: Avoid overflow in index input slices invariant checks. + (Chris Hegarty) Other --------------------- -(No changes) + +* GITHUB#14081: Fix URLs describing why NIOFS is not recommended for Windows. (Marcel Yeonghyeon Ko) + +* GITHUB#14116: Use CDL to block threads to avoid flaky tests. (Ao Li) + +* GITHUB#14091: Cover all DataType. (Lu Xugang) + +* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j + from 1.7.36 to 2.0.16. (Michael Froh) ======================= Lucene 10.1.0 =======================
diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index 1db50b7fdd89..62a30c2444cb 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -17,6 +17,13 @@ # Apache Lucene Migration Guide +## Migration from Lucene 10.x to Lucene 11.0 + +### TieredMergePolicy#setMaxMergeAtOnce removed + +This parameter has no replacement; TieredMergePolicy no longer bounds the +number of segments that may be merged together.
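A minimal migration sketch for code that called the removed setter follows. It assumes only the long-standing `TieredMergePolicy` setters `setSegmentsPerTier` and `setMaxMergedSegmentMB`; the concrete values are illustrative, not recommendations from this change.

```java
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TieredMergePolicy;

class TieredMergePolicyMigrationSketch {
  static IndexWriterConfig upgradeExample() {
    TieredMergePolicy mergePolicy = new TieredMergePolicy();
    // mergePolicy.setMaxMergeAtOnce(30);  // removed in 11.0: drop this call, it has no replacement
    mergePolicy.setSegmentsPerTier(10.0);          // still limits how many segments a tier may hold (example value)
    mergePolicy.setMaxMergedSegmentMB(5 * 1024.0); // still bounds the size of merged segments (example value)
    return new IndexWriterConfig().setMergePolicy(mergePolicy);
  }
}
```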
+ ## Migration from Lucene 9.x to Lucene 10.0 ### DataInput#readVLong() may now read negative vlongs diff --git a/lucene/analysis.tests/src/test/module-info.java b/lucene/analysis.tests/src/test/module-info.java index 3a67c75febb0..d4d8957252b2 100644 --- a/lucene/analysis.tests/src/test/module-info.java +++ b/lucene/analysis.tests/src/test/module-info.java @@ -33,6 +33,7 @@ requires org.apache.lucene.analysis.smartcn; requires org.apache.lucene.analysis.stempel; requires org.apache.lucene.test_framework; + requires org.apache.commons.codec; exports org.apache.lucene.analysis.tests; } diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java index dee4afefc58a..ef7a6fb62452 100644 --- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java +++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java @@ -17,8 +17,8 @@ package org.apache.lucene.analysis.opennlp.tools; -import java.io.IOException; import opennlp.tools.postag.POSModel; +import opennlp.tools.postag.POSTagFormat; import opennlp.tools.postag.POSTagger; import opennlp.tools.postag.POSTaggerME; @@ -29,8 +29,8 @@ public class NLPPOSTaggerOp { private final POSTagger tagger; - public NLPPOSTaggerOp(POSModel model) throws IOException { - tagger = new POSTaggerME(model); + public NLPPOSTaggerOp(POSModel model) { + tagger = new POSTaggerME(model, POSTagFormat.PENN); } public synchronized String[] getPOSTags(String[] words) { diff --git a/lucene/core/src/generated/checksums/generateForDeltaUtil.json b/lucene/core/src/generated/checksums/generateForDeltaUtil.json index 96f49fa81ff1..85765bbd7cbc 100644 --- a/lucene/core/src/generated/checksums/generateForDeltaUtil.json +++ b/lucene/core/src/generated/checksums/generateForDeltaUtil.json @@ -1,4 +1,4 @@ { - "lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java": "e0bf6071bcdefaa297e0bb92f79615201777652d", - "lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py": "d7484ab18da33e5cb73faaf84b4e2bb832b62f9d" + "lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java": "87e4d19b5284fa39adf2c24328cae2076b6f7bb3", + "lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py": "165586f801bef4d2f540521e81bc119880038b6c" } \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java index 51b47a0a1f6d..ceec3ce3342a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java @@ -37,23 +37,6 @@ public final class ForDeltaUtil { private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2; private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4; - // IDENTITY_PLUS_ONE[i] == i+1 - private static final int[] IDENTITY_PLUS_ONE = new int[ForUtil.BLOCK_SIZE]; - - static { - for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { - IDENTITY_PLUS_ONE[i] = i + 1; - } - } - - private static void prefixSumOfOnes(int[] arr, int base) { - System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE); - // This loop gets auto-vectorized - for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { - arr[i] += base; - } - } - private static void prefixSum8(int[] arr, int base) { // 
When the number of bits per value is 4 or less, we can sum up all values in a block without // risking overflowing an 8-bits integer. This allows computing the prefix sum by summing up 4 @@ -199,43 +182,35 @@ private static void innerPrefixSum16(int[] arr) { private final int[] tmp = new int[BLOCK_SIZE]; /** - * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code - * ints} are expected to be deltas between consecutive values. + * Return the number of bits per value required to store the given array containing strictly + * positive numbers. */ - void encodeDeltas(int[] ints, DataOutput out) throws IOException { - if (ints[0] == 1 && PForUtil.allEqual(ints)) { // happens with very dense postings - out.writeByte((byte) 0); - } else { - int or = 0; - for (int l : ints) { - or |= l; - } - assert or != 0; - final int bitsPerValue = PackedInts.bitsRequired(or); - out.writeByte((byte) bitsPerValue); - - final int primitiveSize; - if (bitsPerValue <= 3) { - primitiveSize = 8; - collapse8(ints); - } else if (bitsPerValue <= 10) { - primitiveSize = 16; - collapse16(ints); - } else { - primitiveSize = 32; - } - encode(ints, bitsPerValue, primitiveSize, out, tmp); + int bitsRequired(int[] ints) { + int or = 0; + for (int l : ints) { + or |= l; } + // Deltas should be strictly positive since the delta between consecutive doc IDs is at least 1 + assert or != 0; + return PackedInts.bitsRequired(or); } - /** Decode deltas, compute the prefix sum and add {@code base} to all decoded ints. */ - void decodeAndPrefixSum(PostingDecodingUtil pdu, int base, int[] ints) throws IOException { - final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte()); - if (bitsPerValue == 0) { - prefixSumOfOnes(ints, base); + /** + * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code + * ints} are expected to be deltas between consecutive values. + */ + void encodeDeltas(int bitsPerValue, int[] ints, DataOutput out) throws IOException { + final int primitiveSize; + if (bitsPerValue <= 3) { + primitiveSize = 8; + collapse8(ints); + } else if (bitsPerValue <= 10) { + primitiveSize = 16; + collapse16(ints); } else { - decodeAndPrefixSum(bitsPerValue, pdu, base, ints); + primitiveSize = 32; } + encode(ints, bitsPerValue, primitiveSize, out, tmp); } /** Delta-decode 128 integers into {@code ints}. */ @@ -307,6 +282,9 @@ void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, int base, int prefixSum32(ints, base); break; default: + if (bitsPerValue < 1 || bitsPerValue > Integer.SIZE) { + throw new IllegalStateException("Illegal number of bits per value: " + bitsPerValue); + } decodeSlow(bitsPerValue, pdu, tmp, ints); prefixSum32(ints, base); break; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java index e228f1090ab8..d83111bb8fec 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java @@ -358,8 +358,17 @@ public final class Lucene101PostingsFormat extends PostingsFormat { static final String PAY_CODEC = "Lucene101PostingsWriterPay"; static final int VERSION_START = 0; - static final int VERSION_CURRENT = VERSION_START; + /** + * Version that started encoding dense blocks as bit sets. 
Note: the old format is a subset of the + * new format, so Lucene101PostingsReader is able to read the old format without checking the + * version. + */ + static final int VERSION_DENSE_BLOCKS_AS_BITSETS = 1; + + static final int VERSION_CURRENT = VERSION_DENSE_BLOCKS_AS_BITSETS; + + private final int version; private final int minTermBlockSize; private final int maxTermBlockSize; @@ -378,7 +387,16 @@ public Lucene101PostingsFormat() { * Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) */ public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize) { + this(minTermBlockSize, maxTermBlockSize, VERSION_CURRENT); + } + + /** Expert constructor that allows setting the version. */ + public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize, int version) { super("Lucene101"); + if (version < VERSION_START || version > VERSION_CURRENT) { + throw new IllegalArgumentException("Version out of range: " + version); + } + this.version = version; Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize); this.minTermBlockSize = minTermBlockSize; this.maxTermBlockSize = maxTermBlockSize; @@ -386,7 +404,7 @@ public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize) { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state, version); boolean success = false; try { FieldsConsumer ret = diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java index 6cd16bb7cc36..b73e6316a7dd 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java @@ -53,7 +53,6 @@ import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitUtil; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; @@ -296,12 +295,37 @@ private static int sumOverRange(int[] arr, int start, int end) { final class BlockPostingsEnum extends ImpactsEnum { + private enum DeltaEncoding { + /** + * Deltas between consecutive docs are stored as packed integers, ie. the block is encoded + * using Frame Of Reference (FOR). + */ + PACKED, + /** + * Deltas between consecutive docs are stored using unary coding, ie. {@code delta-1} zero + * bits followed by a one bit, ie. the block is encoded as an offset plus a bit set. + */ + UNARY + } + private ForDeltaUtil forDeltaUtil; private PForUtil pforUtil; + /* Variables that store the content of a block and the current position within this block */ + /* Shared variables */ + private DeltaEncoding encoding; + private int doc; // doc we last read + + /* Variables when the block is stored as packed deltas (Frame Of Reference) */ private final int[] docBuffer = new int[BLOCK_SIZE]; - private int doc; // doc we last read + /* Variables when the block is stored as a bit set */ + // Since we use a bit set when it's more storage-efficient, the bit set cannot have more than + // BLOCK_SIZE*32 bits, which is the maximum possible storage requirement with FOR. 
+ private final FixedBitSet docBitSet = new FixedBitSet(BLOCK_SIZE * Integer.SIZE); + private int docBitSetBase; + // Reuse docBuffer for cumulative pop counts of the words of the bit set. + private final int[] docCumulativeWordPopCounts = docBuffer; // level 0 skip data private int level0LastDocID; @@ -573,7 +597,41 @@ public int freq() throws IOException { } private void refillFullBlock() throws IOException { - forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); + int bitsPerValue = docIn.readByte(); + if (bitsPerValue > 0) { + // block is encoded as 128 packed integers that record the delta between doc IDs + forDeltaUtil.decodeAndPrefixSum(bitsPerValue, docInUtil, prevDocID, docBuffer); + encoding = DeltaEncoding.PACKED; + } else { + // block is encoded as a bit set + assert level0LastDocID != NO_MORE_DOCS; + docBitSetBase = prevDocID + 1; + int numLongs; + if (bitsPerValue == 0) { + // 0 is used to record that all 128 docs in the block are consecutive + numLongs = BLOCK_SIZE / Long.SIZE; // 2 + docBitSet.set(0, BLOCK_SIZE); + } else { + numLongs = -bitsPerValue; + docIn.readLongs(docBitSet.getBits(), 0, numLongs); + } + if (needsFreq) { + // Note: we know that BLOCK_SIZE bits are set, so no need to compute the cumulative pop + // count at the last index, it will be BLOCK_SIZE. + // Note: this for loop auto-vectorizes + for (int i = 0; i < numLongs - 1; ++i) { + docCumulativeWordPopCounts[i] = Long.bitCount(docBitSet.getBits()[i]); + } + for (int i = 1; i < numLongs - 1; ++i) { + docCumulativeWordPopCounts[i] += docCumulativeWordPopCounts[i - 1]; + } + docCumulativeWordPopCounts[numLongs - 1] = BLOCK_SIZE; + assert docCumulativeWordPopCounts[numLongs - 2] + + Long.bitCount(docBitSet.getBits()[numLongs - 1]) + == BLOCK_SIZE; + } + encoding = DeltaEncoding.UNARY; + } if (indexHasFreq) { if (needsFreq) { freqFP = docIn.getFilePointer(); @@ -608,6 +666,7 @@ private void refillRemainder() throws IOException { prevDocID = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; posDocBufferUpto = 0; + encoding = DeltaEncoding.PACKED; assert docBuffer[docBufferSize] == NO_MORE_DOCS; } @@ -669,7 +728,7 @@ private void skipLevel1To(int target) throws IOException { } private void doMoveToNextLevel0Block() throws IOException { - assert docBufferUpto == BLOCK_SIZE; + assert doc == level0LastDocID; if (posIn != null) { if (level0PosEndFP >= posIn.getFilePointer()) { posIn.seek(level0PosEndFP); @@ -728,9 +787,10 @@ private void moveToNextLevel0Block() throws IOException { if (needsDocsAndFreqsOnly && docCountLeft >= BLOCK_SIZE) { // Optimize the common path for exhaustive evaluation long level0NumBytes = docIn.readVLong(); - docIn.skipBytes(level0NumBytes); + long level0End = docIn.getFilePointer() + level0NumBytes; + level0LastDocID += readVInt15(docIn); + docIn.seek(level0End); refillFullBlock(); - level0LastDocID = docBuffer[BLOCK_SIZE - 1]; } else { doMoveToNextLevel0Block(); } @@ -828,16 +888,7 @@ private void skipLevel0To(int target) throws IOException { public void advanceShallow(int target) throws IOException { if (target > level0LastDocID) { // advance level 0 skip data doAdvanceShallow(target); - - // If we are on the last doc ID of a block and we are advancing on the doc ID just beyond - // this block, then we decode the block. This may not be necessary, but this helps avoid - // having to check whether we are in a block that is not decoded yet in #nextDoc(). 
- if (docBufferUpto == BLOCK_SIZE && target == doc + 1) { - refillDocs(); - needsRefilling = false; - } else { - needsRefilling = true; - } + needsRefilling = true; } } @@ -854,11 +905,28 @@ private void doAdvanceShallow(int target) throws IOException { @Override public int nextDoc() throws IOException { - if (docBufferUpto == BLOCK_SIZE) { - moveToNextLevel0Block(); + if (doc == level0LastDocID || needsRefilling) { + if (needsRefilling) { + refillDocs(); + needsRefilling = false; + } else { + moveToNextLevel0Block(); + } } - return this.doc = docBuffer[docBufferUpto++]; + switch (encoding) { + case PACKED: + doc = docBuffer[docBufferUpto]; + break; + case UNARY: + int next = docBitSet.nextSetBit(doc - docBitSetBase + 1); + assert next != NO_MORE_DOCS; + doc = docBitSetBase + next; + break; + } + + ++docBufferUpto; + return this.doc; } @Override @@ -871,43 +939,103 @@ public int advance(int target) throws IOException { needsRefilling = false; } - int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); - this.doc = docBuffer[next]; - docBufferUpto = next + 1; + switch (encoding) { + case PACKED: + { + int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); + this.doc = docBuffer[next]; + docBufferUpto = next + 1; + } + break; + case UNARY: + { + int next = docBitSet.nextSetBit(target - docBitSetBase); + assert next != NO_MORE_DOCS; + this.doc = docBitSetBase + next; + if (needsFreq) { + int wordIndex = next >> 6; + // Take the cumulative pop count for the given word, and subtract bits on the left of + // the current doc. + docBufferUpto = + 1 + + docCumulativeWordPopCounts[wordIndex] + - Long.bitCount(docBitSet.getBits()[wordIndex] >>> next); + } else { + // When only docs needed and block is UNARY encoded, we do not need to maintain + // docBufferUpTo to record the iteration position in the block. + // docBufferUpTo == 0 means the block has not been iterated. + // docBufferUpTo != 0 means the block has been iterated. + docBufferUpto = 1; + } + } + break; + } + return doc; } @Override - public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset) - throws IOException { + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { if (doc >= upTo) { return; } // Handle the current doc separately, it may be on the previous docBuffer. - if (acceptDocs == null || acceptDocs.get(doc)) { - bitSet.set(doc - offset); - } + bitSet.set(doc - offset); for (; ; ) { - if (docBufferUpto == BLOCK_SIZE) { + if (doc == level0LastDocID) { // refill moveToNextLevel0Block(); } - int start = docBufferUpto; - int end = computeBufferEndBoundary(upTo); - if (end != 0) { - bufferIntoBitSet(start, end, acceptDocs, bitSet, offset); - doc = docBuffer[end - 1]; - } - docBufferUpto = end; + switch (encoding) { + case PACKED: + { + int start = docBufferUpto; + int end = computeBufferEndBoundary(upTo); + if (end != 0) { + bufferIntoBitSet(start, end, bitSet, offset); + doc = docBuffer[end - 1]; + } + docBufferUpto = end; + if (end != BLOCK_SIZE) { + // Either the block is a tail block, or the block did not fully match, we're done. + nextDoc(); + assert doc >= upTo; + return; + } + } + break; + case UNARY: + { + int sourceFrom; + if (docBufferUpto == 0) { + // start from beginning + sourceFrom = 0; + } else { + // start after the current doc + sourceFrom = doc - docBitSetBase + 1; + } - if (end != BLOCK_SIZE) { - // Either the block is a tail block, or the block did not fully match, we're done. 
- nextDoc(); - assert doc >= upTo; - break; + int destFrom = docBitSetBase - offset + sourceFrom; + + assert level0LastDocID != NO_MORE_DOCS; + int sourceTo = Math.min(upTo, level0LastDocID + 1) - docBitSetBase; + + if (sourceTo > sourceFrom) { + FixedBitSet.orRange(docBitSet, sourceFrom, bitSet, destFrom, sourceTo - sourceFrom); + } + if (docBitSetBase + sourceTo <= level0LastDocID) { + // We stopped before the end of the current bit set, which means that we're done. + // Set the current doc before returning. + advance(docBitSetBase + sourceTo); + return; + } + doc = level0LastDocID; + docBufferUpto = BLOCK_SIZE; + } + break; } } } @@ -922,15 +1050,12 @@ private int computeBufferEndBoundary(int upTo) { } } - private void bufferIntoBitSet( - int start, int end, Bits acceptDocs, FixedBitSet bitSet, int offset) throws IOException { - // acceptDocs#get (if backed by FixedBitSet), bitSet#set and `doc - offset` get - // auto-vectorized + private void bufferIntoBitSet(int start, int end, FixedBitSet bitSet, int offset) + throws IOException { + // bitSet#set and `doc - offset` get auto-vectorized for (int i = start; i < end; ++i) { int doc = docBuffer[i]; - if (acceptDocs == null || acceptDocs.get(doc)) { - bitSet.set(doc - offset); - } + bitSet.set(doc - offset); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java index 788a5515f2d1..3d19a69b82d8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java @@ -16,16 +16,16 @@ */ package org.apache.lucene.codecs.lucene101; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.*; import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.DOC_CODEC; import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.LEVEL1_MASK; import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.META_CODEC; import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.PAY_CODEC; import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.POS_CODEC; import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.TERMS_CODEC; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.VERSION_CURRENT; import java.io.IOException; +import java.util.Arrays; import java.util.Collection; import java.util.List; import org.apache.lucene.codecs.BlockTermState; @@ -46,6 +46,7 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; /** Writer for {@link Lucene101PostingsFormat}. */ @@ -53,6 +54,8 @@ public class Lucene101PostingsWriter extends PushPostingsWriterBase { static final IntBlockTermState EMPTY_STATE = new IntBlockTermState(); + private final int version; + IndexOutput metaOut; IndexOutput docOut; IndexOutput posOut; @@ -124,8 +127,22 @@ public class Lucene101PostingsWriter extends PushPostingsWriterBase { */ private final ByteBuffersDataOutput level1Output = ByteBuffersDataOutput.newResettableInstance(); - /** Sole constructor. */ + /** + * Reusable FixedBitSet, for dense blocks that are more efficiently stored by storing them as a + * bit set than as packed deltas. 
+ */ + // Since we use a bit set when it's more storage-efficient, the bit set cannot have more than + // BLOCK_SIZE*32 bits, which is the maximum possible storage requirement with FOR. + private final FixedBitSet spareBitSet = new FixedBitSet(BLOCK_SIZE * Integer.SIZE); + + /** Sole public constructor. */ public Lucene101PostingsWriter(SegmentWriteState state) throws IOException { + this(state, Lucene101PostingsFormat.VERSION_CURRENT); + } + + /** Constructor that takes a version. */ + Lucene101PostingsWriter(SegmentWriteState state, int version) throws IOException { + this.version = version; String metaFileName = IndexFileNames.segmentFileName( state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.META_EXTENSION); @@ -139,9 +156,9 @@ public Lucene101PostingsWriter(SegmentWriteState state) throws IOException { try { docOut = state.directory.createOutput(docFileName, state.context); CodecUtil.writeIndexHeader( - metaOut, META_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + metaOut, META_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); CodecUtil.writeIndexHeader( - docOut, DOC_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + docOut, DOC_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); forDeltaUtil = new ForDeltaUtil(); pforUtil = new PForUtil(); if (state.fieldInfos.hasProx()) { @@ -151,7 +168,7 @@ public Lucene101PostingsWriter(SegmentWriteState state) throws IOException { state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.POS_EXTENSION); posOut = state.directory.createOutput(posFileName, state.context); CodecUtil.writeIndexHeader( - posOut, POS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + posOut, POS_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); if (state.fieldInfos.hasPayloads()) { payloadBytes = new byte[128]; @@ -177,7 +194,7 @@ public Lucene101PostingsWriter(SegmentWriteState state) throws IOException { Lucene101PostingsFormat.PAY_EXTENSION); payOut = state.directory.createOutput(payFileName, state.context); CodecUtil.writeIndexHeader( - payOut, PAY_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + payOut, PAY_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); } } else { posDeltaBuffer = null; @@ -207,7 +224,7 @@ public IntBlockTermState newTermState() { @Override public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException { CodecUtil.writeIndexHeader( - termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + termsOut, TERMS_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); termsOut.writeVInt(BLOCK_SIZE); } @@ -405,7 +422,40 @@ private void flushDocBlock(boolean finishTerm) throws IOException { } } long numSkipBytes = level0Output.size(); - forDeltaUtil.encodeDeltas(docDeltaBuffer, level0Output); + // Now we need to decide whether to encode block deltas as packed integers (FOR) or unary + // codes (bit set). FOR makes #nextDoc() a bit faster while the bit set approach makes + // #advance() usually faster and #intoBitSet() much faster. In the end, we make the decision + // based on storage requirements, picking the bit set approach whenever it's more + // storage-efficient than the next number of bits per value (which effectively slightly biases + // towards the bit set approach). 
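To make the storage trade-off described in the comment above concrete, here is a small illustrative helper (hypothetical names, not part of the patch) that mirrors the choice the writer makes below: a block of 128 deltas whose doc IDs span `sum` positions needs roughly `sum` bits as a bit set, versus `bitsPerValue * 128` bits with FOR, and the bit set is preferred whenever it beats the next packed width up.

```java
// Illustrative only: mirrors the decision coded below in flushDocBlock.
final class DocBlockEncodingChoice {
  private static final int BLOCK_SIZE = 128; // Lucene101PostingsFormat.BLOCK_SIZE

  static boolean preferBitSet(int bitsPerValue, int sum) {
    // FOR at the next-larger bit width would need this many bits for the whole block.
    int numBitsNextBitsPerValue = Math.min(Integer.SIZE, bitsPerValue + 1) * BLOCK_SIZE;
    // A bit set spanning `sum` doc IDs needs about `sum` bits (rounded up to whole longs).
    // Example: bitsPerValue = 5, sum = 600 -> ~600 bits as a bit set vs. 6 * 128 = 768 bits packed,
    // so the bit set encoding wins; comparing against the *next* width is the slight bias noted above.
    return sum < numBitsNextBitsPerValue;
  }
}
```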
+ int bitsPerValue = forDeltaUtil.bitsRequired(docDeltaBuffer); + int sum = Math.toIntExact(Arrays.stream(docDeltaBuffer).sum()); + int numBitSetLongs = FixedBitSet.bits2words(sum); + int numBitsNextBitsPerValue = Math.min(Integer.SIZE, bitsPerValue + 1) * BLOCK_SIZE; + if (sum == BLOCK_SIZE) { + level0Output.writeByte((byte) 0); + } else if (version < VERSION_DENSE_BLOCKS_AS_BITSETS || numBitsNextBitsPerValue <= sum) { + level0Output.writeByte((byte) bitsPerValue); + forDeltaUtil.encodeDeltas(bitsPerValue, docDeltaBuffer, level0Output); + } else { + // Storing doc deltas is more efficient using unary coding (ie. storing doc IDs as a bit + // set) + spareBitSet.clear(0, numBitSetLongs << 6); + int s = -1; + for (int i : docDeltaBuffer) { + s += i; + spareBitSet.set(s); + } + // We never use the bit set encoding when it requires more than Integer.SIZE=32 bits per + // value. So the bit set cannot have more than BLOCK_SIZE * Integer.SIZE / Long.SIZE = 64 + // longs, which fits on a byte. + assert numBitSetLongs <= BLOCK_SIZE / 2; + level0Output.writeByte((byte) -numBitSetLongs); + for (int i = 0; i < numBitSetLongs; ++i) { + level0Output.writeLong(spareBitSet.getBits()[i]); + } + } + if (writeFreqs) { pforUtil.encode(freqBuffer, level0Output); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py index 3214aa671002..b1b36db096a7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py @@ -63,23 +63,6 @@ private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2; private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4; - // IDENTITY_PLUS_ONE[i] == i+1 - private static final int[] IDENTITY_PLUS_ONE = new int[ForUtil.BLOCK_SIZE]; - - static { - for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { - IDENTITY_PLUS_ONE[i] = i + 1; - } - } - - private static void prefixSumOfOnes(int[] arr, int base) { - System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE); - // This loop gets auto-vectorized - for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { - arr[i] += base; - } - } - private static void prefixSum8(int[] arr, int base) { // When the number of bits per value is 4 or less, we can sum up all values in a block without // risking overflowing an 8-bits integer. This allows computing the prefix sum by summing up 4 @@ -224,44 +207,33 @@ private final int[] tmp = new int[BLOCK_SIZE]; + /** Return the number of bits per value required to store the given array containing strictly positive numbers. */ + int bitsRequired(int[] ints) { + int or = 0; + for (int l : ints) { + or |= l; + } + // Deltas should be strictly positive since the delta between consecutive doc IDs is at least 1 + assert or != 0; + return PackedInts.bitsRequired(or); + } + /** * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code * ints} are expected to be deltas between consecutive values. 
*/ - void encodeDeltas(int[] ints, DataOutput out) throws IOException { - if (ints[0] == 1 && PForUtil.allEqual(ints)) { // happens with very dense postings - out.writeByte((byte) 0); - } else { - int or = 0; - for (int l : ints) { - or |= l; - } - assert or != 0; - final int bitsPerValue = PackedInts.bitsRequired(or); - out.writeByte((byte) bitsPerValue); - - final int primitiveSize; - if (bitsPerValue <= 3) { - primitiveSize = 8; - collapse8(ints); - } else if (bitsPerValue <= 10) { - primitiveSize = 16; - collapse16(ints); - } else { - primitiveSize = 32; - } - encode(ints, bitsPerValue, primitiveSize, out, tmp); - } - } - - /** Decode deltas, compute the prefix sum and add {@code base} to all decoded ints. */ - void decodeAndPrefixSum(PostingDecodingUtil pdu, int base, int[] ints) throws IOException { - final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte()); - if (bitsPerValue == 0) { - prefixSumOfOnes(ints, base); + void encodeDeltas(int bitsPerValue, int[] ints, DataOutput out) throws IOException { + final int primitiveSize; + if (bitsPerValue <= 3) { + primitiveSize = 8; + collapse8(ints); + } else if (bitsPerValue <= 10) { + primitiveSize = 16; + collapse16(ints); } else { - decodeAndPrefixSum(bitsPerValue, pdu, base, ints); + primitiveSize = 32; } + encode(ints, bitsPerValue, primitiveSize, out, tmp); } """ @@ -361,6 +333,9 @@ def writeDecode(bpv, f): f.write(' prefixSum%d(ints, base);\n' %primitive_size) f.write(' break;\n') f.write(' default:\n') + f.write(' if (bitsPerValue < 1 || bitsPerValue > Integer.SIZE) {\n') + f.write(' throw new IllegalStateException("Illegal number of bits per value: " + bitsPerValue);\n') + f.write(' }\n') f.write(' decodeSlow(bitsPerValue, pdu, tmp, ints);\n') f.write(' prefixSum32(ints, base);\n') f.write(' break;\n') diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java index 11e83b3f03c1..80dffb7b9708 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java @@ -1122,10 +1122,9 @@ private class TermsDict extends BaseTermsEnum { final LongValues indexAddresses; final RandomAccessInput indexBytes; final BytesRef term; + final BytesRef blockBuffer; + final ByteArrayDataInput blockInput; long ord = -1; - - BytesRef blockBuffer = null; - ByteArrayDataInput blockInput = null; long currentCompressedBlockStart = -1; long currentCompressedBlockEnd = -1; @@ -1149,6 +1148,7 @@ private class TermsDict extends BaseTermsEnum { // add 7 padding bytes can help decompression run faster. int bufferSize = entry.maxBlockLength + entry.maxTermLength + LZ4_DECOMPRESSOR_PADDING; blockBuffer = new BytesRef(new byte[bufferSize], 0, bufferSize); + blockInput = new ByteArrayDataInput(); } @Override @@ -1324,8 +1324,7 @@ private void decompressBlock() throws IOException { } // Reset the buffer. 
- blockInput = - new ByteArrayDataInput(blockBuffer.bytes, blockBuffer.offset, blockBuffer.length); + blockInput.reset(blockBuffer.bytes, blockBuffer.offset, blockBuffer.length); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java index ce9ee1b79cc0..e219157ab986 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java @@ -243,13 +243,14 @@ private HnswGraph reconstructAndWriteGraph( nodesByLevel.add(null); int maxOrd = graph.size(); + int[] scratch = new int[graph.maxConn() * 2]; NodesIterator nodesOnLevel0 = graph.getNodesOnLevel(0); levelNodeOffsets[0] = new int[nodesOnLevel0.size()]; while (nodesOnLevel0.hasNext()) { int node = nodesOnLevel0.nextInt(); NeighborArray neighbors = graph.getNeighbors(0, newToOldMap[node]); long offset = vectorIndex.getFilePointer(); - reconstructAndWriteNeighbours(neighbors, oldToNewMap, maxOrd); + reconstructAndWriteNeighbours(neighbors, oldToNewMap, scratch, maxOrd); levelNodeOffsets[0][node] = Math.toIntExact(vectorIndex.getFilePointer() - offset); } @@ -266,7 +267,7 @@ private HnswGraph reconstructAndWriteGraph( for (int node : newNodes) { NeighborArray neighbors = graph.getNeighbors(level, newToOldMap[node]); long offset = vectorIndex.getFilePointer(); - reconstructAndWriteNeighbours(neighbors, oldToNewMap, maxOrd); + reconstructAndWriteNeighbours(neighbors, oldToNewMap, scratch, maxOrd); levelNodeOffsets[level][nodeOffsetIndex++] = Math.toIntExact(vectorIndex.getFilePointer() - offset); } @@ -313,25 +314,33 @@ public NodesIterator getNodesOnLevel(int level) { }; } - private void reconstructAndWriteNeighbours(NeighborArray neighbors, int[] oldToNewMap, int maxOrd) - throws IOException { + private void reconstructAndWriteNeighbours( + NeighborArray neighbors, int[] oldToNewMap, int[] scratch, int maxOrd) throws IOException { int size = neighbors.size(); - vectorIndex.writeVInt(size); - // Destructively modify; it's ok we are discarding it after this int[] nnodes = neighbors.nodes(); for (int i = 0; i < size; i++) { nnodes[i] = oldToNewMap[nnodes[i]]; } Arrays.sort(nnodes, 0, size); + int actualSize = 0; + if (size > 0) { + scratch[0] = nnodes[0]; + actualSize = 1; + } // Now that we have sorted, do delta encoding to minimize the required bits to store the // information - for (int i = size - 1; i > 0; --i) { + for (int i = 1; i < size; i++) { assert nnodes[i] < maxOrd : "node too large: " + nnodes[i] + ">=" + maxOrd; - nnodes[i] -= nnodes[i - 1]; + if (nnodes[i - 1] == nnodes[i]) { + continue; + } + scratch[actualSize++] = nnodes[i] - nnodes[i - 1]; } - for (int i = 0; i < size; i++) { - vectorIndex.writeVInt(nnodes[i]); + // Write the size after duplicates are removed + vectorIndex.writeVInt(actualSize); + for (int i = 0; i < actualSize; i++) { + vectorIndex.writeVInt(scratch[i]); } } @@ -408,6 +417,7 @@ private int[][] writeGraph(OnHeapHnswGraph graph) throws IOException { // write vectors' neighbours on each level into the vectorIndex file int countOnLevel0 = graph.size(); int[][] offsets = new int[graph.numLevels()][]; + int[] scratch = new int[graph.maxConn() * 2]; for (int level = 0; level < graph.numLevels(); level++) { int[] sortedNodes = NodesIterator.getSortedNodes(graph.getNodesOnLevel(level)); offsets[level] = new int[sortedNodes.length]; @@ -417,18 +427,26 @@ private 
int[][] writeGraph(OnHeapHnswGraph graph) throws IOException { int size = neighbors.size(); // Write size in VInt as the neighbors list is typically small long offsetStart = vectorIndex.getFilePointer(); - vectorIndex.writeVInt(size); - // Destructively modify; it's ok we are discarding it after this int[] nnodes = neighbors.nodes(); Arrays.sort(nnodes, 0, size); // Now that we have sorted, do delta encoding to minimize the required bits to store the // information - for (int i = size - 1; i > 0; --i) { + int actualSize = 0; + if (size > 0) { + scratch[0] = nnodes[0]; + actualSize = 1; + } + for (int i = 1; i < size; i++) { assert nnodes[i] < countOnLevel0 : "node too large: " + nnodes[i] + ">=" + countOnLevel0; - nnodes[i] -= nnodes[i - 1]; + if (nnodes[i - 1] == nnodes[i]) { + continue; + } + scratch[actualSize++] = nnodes[i] - nnodes[i - 1]; } - for (int i = 0; i < size; i++) { - vectorIndex.writeVInt(nnodes[i]); + // Write the size after duplicates are removed + vectorIndex.writeVInt(actualSize); + for (int i = 0; i < actualSize; i++) { + vectorIndex.writeVInt(scratch[i]); } offsets[level][nodeOffsetId++] = Math.toIntExact(vectorIndex.getFilePointer() - offsetStart); @@ -452,11 +470,12 @@ private void writeMeta( meta.writeVLong(vectorIndexLength); meta.writeVInt(field.getVectorDimension()); meta.writeInt(count); - meta.writeVInt(M); // write graph nodes on each level if (graph == null) { + meta.writeVInt(M); meta.writeVInt(0); } else { + meta.writeVInt(graph.maxConn()); meta.writeVInt(graph.numLevels()); long valueCount = 0; for (int level = 0; level < graph.numLevels(); level++) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java index 1a30b5271cd7..39f3a81983b4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java @@ -223,7 +223,7 @@ public FlatFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOExceptio public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { rawVectorDelegate.mergeOneField(fieldInfo, mergeState); // Since we know we will not be searching for additional indexing, we can just write the - // the vectors directly to the new segment. + // vectors directly to the new segment. // No need to use temporary file as we don't have to re-open for reading if (fieldInfo.getVectorEncoding().equals(VectorEncoding.FLOAT32)) { ScalarQuantizer mergedQuantizationState = diff --git a/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java b/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java index 7f5f8cf6290c..4e816ffa6259 100644 --- a/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java @@ -44,6 +44,7 @@ import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.NumericUtils; /** Distance query for {@link LatLonPoint}. 
*/ @@ -233,6 +234,11 @@ public void visit(int docID) { adder.add(docID); } + @Override + public void visit(IntsRef ref) { + adder.add(ref); + } + @Override public void visit(DocIdSetIterator iterator) throws IOException { adder.add(iterator); @@ -269,6 +275,14 @@ public void visit(int docID) { cost[0]--; } + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; i++) { + result.clear(ref.ints[ref.offset + i]); + } + cost[0] = Math.max(0, cost[0] - ref.length); + } + @Override public void visit(DocIdSetIterator iterator) throws IOException { result.andNot(iterator); diff --git a/lucene/core/src/java/org/apache/lucene/document/LongDistanceFeatureQuery.java b/lucene/core/src/java/org/apache/lucene/document/LongDistanceFeatureQuery.java index c675136ca80d..788ded4909ba 100644 --- a/lucene/core/src/java/org/apache/lucene/document/LongDistanceFeatureQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/LongDistanceFeatureQuery.java @@ -35,6 +35,7 @@ import org.apache.lucene.search.ScorerSupplier; import org.apache.lucene.search.Weight; import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.NumericUtils; final class LongDistanceFeatureQuery extends Query { @@ -405,6 +406,21 @@ public void visit(int docID, byte[] packedValue) { adder.add(docID); } + @Override + public void visit(DocIdSetIterator iterator) throws IOException { + int docID; + while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + visit(docID); + } + } + + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; ++i) { + visit(ref.ints[ref.offset + i]); + } + } + @Override public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { long minDocValue = NumericUtils.sortableBytesToLong(minPackedValue, 0); diff --git a/lucene/core/src/java/org/apache/lucene/document/RangeFieldQuery.java b/lucene/core/src/java/org/apache/lucene/document/RangeFieldQuery.java index f5747c0f8bde..8248441f3cda 100644 --- a/lucene/core/src/java/org/apache/lucene/document/RangeFieldQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/RangeFieldQuery.java @@ -38,6 +38,7 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil.ByteArrayComparator; import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.IntsRef; /** * Query class for searching {@code RangeField} types by a defined {@link Relation}. 
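Several point-query visitors in this patch gain a bulk visit(IntsRef) override so the points reader can hand over a whole buffer of matching doc IDs at once instead of calling visit(int) per document. A sketch of that pattern follows; CollectingVisitor is a hypothetical name, and it assumes the DocIdSetBuilder.BulkAdder#add(IntsRef) overload that the patch itself relies on:

```java
import java.io.IOException;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.IntsRef;

/** Hypothetical visitor showing the bulk visit(IntsRef) pattern. */
final class CollectingVisitor implements IntersectVisitor {

  private final DocIdSetBuilder.BulkAdder adder;

  CollectingVisitor(DocIdSetBuilder.BulkAdder adder) {
    this.adder = adder;
  }

  @Override
  public void visit(int docID) {
    adder.add(docID);
  }

  @Override
  public void visit(IntsRef ref) {
    // Bulk path: hand the whole buffer of doc IDs to the adder in one call.
    adder.add(ref);
  }

  @Override
  public void visit(DocIdSetIterator iterator) throws IOException {
    adder.add(iterator);
  }

  @Override
  public void visit(int docID, byte[] packedValue) {
    // A real visitor would test packedValue against the query shape first.
    adder.add(docID);
  }

  @Override
  public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
    return Relation.CELL_CROSSES_QUERY;
  }
}
```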
@@ -401,7 +402,12 @@ public void grow(int count) { } @Override - public void visit(int docID) throws IOException { + public void visit(IntsRef ref) { + adder.add(ref); + } + + @Override + public void visit(int docID) { adder.add(docID); } @@ -411,7 +417,7 @@ public void visit(DocIdSetIterator iterator) throws IOException { } @Override - public void visit(int docID, byte[] leaf) throws IOException { + public void visit(int docID, byte[] leaf) { if (queryType.matches(ranges, leaf, numDims, bytesPerDim, comparator)) { visit(docID); } diff --git a/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java b/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java index 811591d9a1cd..4caf06526869 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java @@ -49,6 +49,7 @@ import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IntsRef; /** * Base query class for all spatial geometries: {@link LatLonShape}, {@link LatLonPoint} and {@link @@ -445,6 +446,11 @@ public void visit(DocIdSetIterator iterator) throws IOException { adder.add(iterator); } + @Override + public void visit(IntsRef ref) { + adder.add(ref); + } + @Override public void visit(int docID, byte[] t) { if (leafPredicate.test(t)) { @@ -489,6 +495,14 @@ public void visit(DocIdSetIterator iterator) throws IOException { cost[0] += iterator.cost(); } + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; i++) { + result.set(ref.ints[ref.offset + i]); + } + cost[0] += ref.length; + } + @Override public void visit(int docID, byte[] t) { if (result.get(docID) == false) { @@ -532,6 +546,14 @@ public void visit(int docID) { cost[0]++; } + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; i++) { + result.set(ref.ints[ref.offset + i]); + } + cost[0] += ref.length; + } + @Override public void visit(DocIdSetIterator iterator) throws IOException { result.or(iterator); @@ -589,6 +611,13 @@ public void visit(DocIdSetIterator iterator) throws IOException { excluded.or(iterator); } + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; i++) { + visit(ref.ints[ref.offset + i]); + } + } + @Override public void visit(int docID, byte[] t) { if (excluded.get(docID) == false) { @@ -643,6 +672,14 @@ public void visit(int docID) { cost[0]--; } + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; i++) { + result.clear(ref.ints[ref.offset + i]); + } + cost[0] = Math.max(0, cost[0] - ref.length); + } + @Override public void visit(DocIdSetIterator iterator) throws IOException { result.andNot(iterator); @@ -693,6 +730,13 @@ public void visit(DocIdSetIterator iterator) throws IOException { result.andNot(iterator); } + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; i++) { + visit(ref.ints[ref.offset + i]); + } + } + @Override public void visit(int docID, byte[] packedTriangle) { // NO-OP diff --git a/lucene/core/src/java/org/apache/lucene/document/XYPointInGeometryQuery.java b/lucene/core/src/java/org/apache/lucene/document/XYPointInGeometryQuery.java index 47b6abb46c22..833d9c9209c6 100644 --- a/lucene/core/src/java/org/apache/lucene/document/XYPointInGeometryQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/XYPointInGeometryQuery.java @@ -38,6 +38,7 @@ import 
org.apache.lucene.search.ScorerSupplier; import org.apache.lucene.search.Weight; import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.IntsRef; /** * Finds all previously indexed points that fall within the specified XY geometries. @@ -90,6 +91,11 @@ public void visit(DocIdSetIterator iterator) throws IOException { adder.add(iterator); } + @Override + public void visit(IntsRef ref) { + adder.add(ref); + } + @Override public void visit(int docID, byte[] packedValue) { double x = XYEncodingUtils.decode(packedValue, 0); diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index d957af01d0a2..b3a5e4dc5d11 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -26,8 +26,11 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.text.NumberFormat; +import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; +import java.util.Deque; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -52,12 +55,14 @@ import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.hnsw.FlatVectorsReader; +import org.apache.lucene.codecs.hnsw.HnswGraphProvider; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.DocumentStoredFieldVisitor; import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; +import org.apache.lucene.internal.hppc.IntIntHashMap; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.KnnCollector; @@ -74,6 +79,7 @@ import org.apache.lucene.store.Lock; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil.ByteArrayComparator; +import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -91,6 +97,7 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.hnsw.HnswGraph; /** * Basic tool and API to check the health of an index and write a new segments file that removes @@ -249,6 +256,9 @@ public static class SegmentInfoStatus { /** Status of vectors */ public VectorValuesStatus vectorValuesStatus; + /** Status of HNSW graph */ + public HnswGraphsStatus hnswGraphsStatus; + /** Status of soft deletes */ public SoftDeletesStatus softDeletesStatus; @@ -406,6 +416,32 @@ public static final class VectorValuesStatus { public Throwable error; } + /** Status from testing a single HNSW graph */ + public static final class HnswGraphStatus { + + HnswGraphStatus() {} + + /** Number of nodes at each level */ + public List numNodesAtLevel; + + /** Connectedness at each level represented as a fraction */ + public List connectednessAtLevel; + } + + /** Status from testing all HNSW graphs */ + public static final class HnswGraphsStatus { + + HnswGraphsStatus() { + this.hnswGraphsStatusByField = new HashMap<>(); + } + + /** Status of the HNSW graph keyed with field name */ + public Map 
hnswGraphsStatusByField; + + /** Exception thrown during term index test (null on success) */ + public Throwable error; + } + /** Status from testing index sort */ public static final class IndexSortStatus { IndexSortStatus() {} @@ -1085,6 +1121,9 @@ private Status.SegmentInfoStatus testSegment( // Test FloatVectorValues and ByteVectorValues segInfoStat.vectorValuesStatus = testVectors(reader, infoStream, failFast); + // Test HNSW graph + segInfoStat.hnswGraphsStatus = testHnswGraphs(reader, infoStream, failFast); + // Test Index Sort if (indexSort != null) { segInfoStat.indexSortStatus = testSort(reader, indexSort, infoStream, failFast); @@ -2746,6 +2785,196 @@ public static Status.VectorValuesStatus testVectors( return status; } + /** Test the HNSW graph. */ + public static Status.HnswGraphsStatus testHnswGraphs( + CodecReader reader, PrintStream infoStream, boolean failFast) throws IOException { + if (infoStream != null) { + infoStream.print(" test: hnsw graphs........."); + } + long startNS = System.nanoTime(); + Status.HnswGraphsStatus status = new Status.HnswGraphsStatus(); + KnnVectorsReader vectorsReader = reader.getVectorReader(); + FieldInfos fieldInfos = reader.getFieldInfos(); + + try { + if (fieldInfos.hasVectorValues()) { + for (FieldInfo fieldInfo : fieldInfos) { + if (fieldInfo.hasVectorValues()) { + KnnVectorsReader fieldReader = getFieldReaderForName(vectorsReader, fieldInfo.name); + if (fieldReader instanceof HnswGraphProvider graphProvider) { + HnswGraph hnswGraph = graphProvider.getGraph(fieldInfo.name); + testHnswGraph(hnswGraph, fieldInfo.name, status); + } + } + } + } + msg( + infoStream, + String.format( + Locale.ROOT, + "OK [%d fields] [took %.3f sec]", + status.hnswGraphsStatusByField.size(), + nsToSec(System.nanoTime() - startNS))); + printHnswInfo(infoStream, status.hnswGraphsStatusByField); + } catch (Exception e) { + if (failFast) { + throw IOUtils.rethrowAlways(e); + } + msg(infoStream, "ERROR: " + e); + status.error = e; + if (infoStream != null) { + e.printStackTrace(infoStream); + } + } + + return status; + } + + private static KnnVectorsReader getFieldReaderForName( + KnnVectorsReader vectorsReader, String fieldName) { + if (vectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader fieldsReader) { + return fieldsReader.getFieldReader(fieldName); + } else { + return vectorsReader; + } + } + + private static void printHnswInfo( + PrintStream infoStream, Map fieldsStatus) { + for (Map.Entry entry : fieldsStatus.entrySet()) { + String fieldName = entry.getKey(); + CheckIndex.Status.HnswGraphStatus status = entry.getValue(); + msg(infoStream, " hnsw field name: " + fieldName); + + int numLevels = Math.min(status.numNodesAtLevel.size(), status.connectednessAtLevel.size()); + for (int level = numLevels - 1; level >= 0; level--) { + int numNodes = status.numNodesAtLevel.get(level); + String connectedness = status.connectednessAtLevel.get(level); + msg( + infoStream, + String.format( + Locale.ROOT, + " level %d: %d nodes, %s connected", + level, + numNodes, + connectedness)); + } + } + } + + private static void testHnswGraph( + HnswGraph hnswGraph, String fieldName, Status.HnswGraphsStatus status) + throws IOException, CheckIndexException { + if (hnswGraph != null) { + status.hnswGraphsStatusByField.put(fieldName, new Status.HnswGraphStatus()); + final int numLevels = hnswGraph.numLevels(); + status.hnswGraphsStatusByField.get(fieldName).numNodesAtLevel = + new ArrayList<>(Collections.nCopies(numLevels, null)); + 
status.hnswGraphsStatusByField.get(fieldName).connectednessAtLevel = + new ArrayList<>(Collections.nCopies(numLevels, null)); + // Perform checks on each level of the HNSW graph + for (int level = numLevels - 1; level >= 0; level--) { + // Collect BitSet of all nodes on this level + BitSet nodesOnThisLevel = new FixedBitSet(hnswGraph.size()); + HnswGraph.NodesIterator nodesIterator = hnswGraph.getNodesOnLevel(level); + while (nodesIterator.hasNext()) { + nodesOnThisLevel.set(nodesIterator.nextInt()); + } + + nodesIterator = hnswGraph.getNodesOnLevel(level); + // Perform checks on each node on the level + while (nodesIterator.hasNext()) { + int node = nodesIterator.nextInt(); + if (node < 0 || node > hnswGraph.size() - 1) { + throw new CheckIndexException( + "Field \"" + + fieldName + + "\" has node: " + + node + + " not in the expected range [0, " + + (hnswGraph.size() - 1) + + "]"); + } + + // Perform checks on the node's neighbors + hnswGraph.seek(level, node); + int nbr, lastNeighbor = -1, firstNeighbor = -1; + while ((nbr = hnswGraph.nextNeighbor()) != NO_MORE_DOCS) { + if (!nodesOnThisLevel.get(nbr)) { + throw new CheckIndexException( + "Field \"" + + fieldName + + "\" has node: " + + node + + " with a neighbor " + + nbr + + " which is not on its level (" + + level + + ")"); + } + if (firstNeighbor == -1) { + firstNeighbor = nbr; + } + if (nbr < lastNeighbor) { + throw new CheckIndexException( + "Field \"" + + fieldName + + "\" has neighbors out of order for node " + + node + + ": " + + nbr + + "<" + + lastNeighbor + + " 1st=" + + firstNeighbor); + } else if (nbr == lastNeighbor) { + throw new CheckIndexException( + "Field \"" + + fieldName + + "\" has repeated neighbors of node " + + node + + " with value " + + nbr); + } + lastNeighbor = nbr; + } + } + int numNodesOnLayer = nodesIterator.size(); + status.hnswGraphsStatusByField.get(fieldName).numNodesAtLevel.set(level, numNodesOnLayer); + + // Evaluate connectedness at this level by measuring the number of nodes reachable from the + // entry point + IntIntHashMap connectedNodes = getConnectedNodesOnLevel(hnswGraph, numNodesOnLayer, level); + status + .hnswGraphsStatusByField + .get(fieldName) + .connectednessAtLevel + .set(level, connectedNodes.size() + "/" + numNodesOnLayer); + } + } + } + + private static IntIntHashMap getConnectedNodesOnLevel( + HnswGraph hnswGraph, int numNodesOnLayer, int level) throws IOException { + IntIntHashMap connectedNodes = new IntIntHashMap(numNodesOnLayer); + int entryPoint = hnswGraph.entryNode(); + Deque stack = new ArrayDeque<>(); + stack.push(entryPoint); + while (!stack.isEmpty()) { + int node = stack.pop(); + if (connectedNodes.containsKey(node)) { + continue; + } + connectedNodes.put(node, 1); + hnswGraph.seek(level, node); + int friendOrd; + while ((friendOrd = hnswGraph.nextNeighbor()) != NO_MORE_DOCS) { + stack.push(friendOrd); + } + } + return connectedNodes; + } + private static boolean vectorsReaderSupportsSearch(CodecReader codecReader, String fieldName) { KnnVectorsReader vectorsReader = codecReader.getVectorReader(); if (vectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader perFieldReader) { diff --git a/lucene/core/src/java/org/apache/lucene/index/CompositeReaderContext.java b/lucene/core/src/java/org/apache/lucene/index/CompositeReaderContext.java index f64a460325bc..0004df794e5c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CompositeReaderContext.java +++ b/lucene/core/src/java/org/apache/lucene/index/CompositeReaderContext.java @@ -32,8 +32,8 @@ static 
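The connectedness check added to CheckIndex above walks each level from the entry node and counts how many nodes are reachable. A simplified, self-contained sketch of the same traversal, using a plain HashSet rather than Lucene's IntIntHashMap:

```java
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.hnsw.HnswGraph;

/** Simplified reachability count from the entry node on one HNSW level. */
final class HnswConnectedness {

  static int reachableFromEntryNode(HnswGraph graph, int level) throws IOException {
    Set<Integer> visited = new HashSet<>();
    Deque<Integer> stack = new ArrayDeque<>();
    stack.push(graph.entryNode());
    while (stack.isEmpty() == false) {
      int node = stack.pop();
      if (visited.add(node) == false) {
        continue; // already seen
      }
      graph.seek(level, node);
      int neighbor;
      while ((neighbor = graph.nextNeighbor()) != DocIdSetIterator.NO_MORE_DOCS) {
        stack.push(neighbor);
      }
    }
    // The level is fully connected when this equals the number of nodes on the level.
    return visited.size();
  }
}
```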
CompositeReaderContext create(CompositeReader reader) { } /** - * Creates a {@link CompositeReaderContext} for intermediate readers that aren't not top-level - * readers in the current context + * Creates a {@link CompositeReaderContext} for intermediate readers that aren't top-level readers + * in the current context */ CompositeReaderContext( CompositeReaderContext parent, diff --git a/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java index d50ae3a85cba..8f56ae49d3e6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java @@ -124,7 +124,7 @@ public static DirectoryReader open(final IndexCommit commit) throws IOException /** * Expert: returns an IndexReader reading the index on the given {@link IndexCommit}. This method - * allows to open indices that were created wih a Lucene version older than N-1 provided that all + * allows to open indices that were created with a Lucene version older than N-1 provided that all * codecs for this index are available in the classpath and the segment file format used was * created with Lucene 7 or newer. Users of this API must be aware that Lucene doesn't guarantee * semantic compatibility for indices created with versions older than N-1. All backwards @@ -150,8 +150,7 @@ public static DirectoryReader open( /** * If the index has changed since the provided reader was opened, open and return a new reader; * else, return null. The new reader, if not null, will be the same type of reader as the previous - * one, ie an NRT reader will open a new NRT reader, a MultiReader will open a new MultiReader, - * etc. + * one, ie an NRT reader will open a new NRT reader etc. * *
<p>
This method is typically far less costly than opening a fully new DirectoryReader * as it shares resources (for example sub-readers) with the provided @@ -192,7 +191,7 @@ public static DirectoryReader openIfChanged(DirectoryReader oldReader, IndexComm * never returns null). * *
<p>
This provides "near real-time" searching, in that changes made during an {@link IndexWriter} - * session can be quickly made available for searching without closing the writer nor calling + * session can be quickly made available for searching without closing the writer or calling * {@link IndexWriter#commit}. * *
<p>
It's near real-time because there is no hard guarantee on how quickly you can get a @@ -305,7 +304,8 @@ public static List listCommits(Directory dir) throws IOException { /** * Returns true if an index likely exists at the specified directory. Note that if a - * corrupt index exists, or if an index in the process of committing + * corrupt index exists, or if an index in the process of committing the return value is not + * reliable. * * @param directory the directory to check for an index * @return true if an index exists; false otherwise diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesFieldUpdates.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesFieldUpdates.java index 96da8625c444..ffcb9f07c9b1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValuesFieldUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesFieldUpdates.java @@ -20,13 +20,10 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.Accountable; -import org.apache.lucene.util.BitSet; -import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntroSorter; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.SparseFixedBitSet; import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PagedMutable; @@ -480,107 +477,4 @@ final boolean hasValue() { return hasValue; } } - - abstract static class SingleValueDocValuesFieldUpdates extends DocValuesFieldUpdates { - private final BitSet bitSet; - private BitSet hasNoValue; - private boolean hasAtLeastOneValue; - - protected SingleValueDocValuesFieldUpdates( - int maxDoc, long delGen, String field, DocValuesType type) { - super(maxDoc, delGen, field, type); - this.bitSet = new SparseFixedBitSet(maxDoc); - } - - @Override - void add(int doc, long value) { - assert longValue() == value; - bitSet.set(doc); - this.hasAtLeastOneValue = true; - if (hasNoValue != null) { - hasNoValue.clear(doc); - } - } - - @Override - void add(int doc, BytesRef value) { - assert binaryValue().equals(value); - bitSet.set(doc); - this.hasAtLeastOneValue = true; - if (hasNoValue != null) { - hasNoValue.clear(doc); - } - } - - @Override - synchronized void reset(int doc) { - bitSet.set(doc); - this.hasAtLeastOneValue = true; - if (hasNoValue == null) { - hasNoValue = new SparseFixedBitSet(maxDoc); - } - hasNoValue.set(doc); - } - - @Override - void add(int docId, Iterator iterator) { - throw new UnsupportedOperationException(); - } - - protected abstract BytesRef binaryValue(); - - protected abstract long longValue(); - - @Override - synchronized boolean any() { - return super.any() || hasAtLeastOneValue; - } - - @Override - public long ramBytesUsed() { - return super.ramBytesUsed() - + bitSet.ramBytesUsed() - + (hasNoValue == null ? 
0 : hasNoValue.ramBytesUsed()); - } - - @Override - Iterator iterator() { - BitSetIterator iterator = new BitSetIterator(bitSet, maxDoc); - return new DocValuesFieldUpdates.Iterator() { - - @Override - public int docID() { - return iterator.docID(); - } - - @Override - public int nextDoc() { - return iterator.nextDoc(); - } - - @Override - long longValue() { - return SingleValueDocValuesFieldUpdates.this.longValue(); - } - - @Override - BytesRef binaryValue() { - return SingleValueDocValuesFieldUpdates.this.binaryValue(); - } - - @Override - long delGen() { - return delGen; - } - - @Override - boolean hasValue() { - if (hasNoValue != null) { - return hasNoValue.get(docID()) == false; - } - return true; - } - }; - } - } } diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java index e32c8b20c047..71797257ee70 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java @@ -486,7 +486,7 @@ private void doFlush(DocumentsWriterPerThread flushingDWPT) throws IOException { * flush 'B' starts and freezes all deletes occurred since 'A' has * started. if 'B' finishes before 'A' we need to wait until 'A' is done * otherwise the deletes frozen by 'B' are not applied to 'A' and we - * might miss to deletes documents in 'A'. + * might miss to delete documents in 'A'. */ try { assert assertTicketQueueModification(flushingDWPT.deleteQueue); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java index 170966e8ae49..ed3b9d0698d1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java @@ -216,7 +216,7 @@ && delta < ramBufferGranularity()) { // we need to commit this under lock but calculate it outside of the lock to minimize the time // this lock is held // per document. The reason we update this under lock is that we mark DWPTs as pending without - // acquiring it's + // acquiring its // lock in #setFlushPending and this also reads the committed bytes and modifies the // flush/activeBytes. // In the future we can clean this up to be more intuitive. diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index fd6ed22bd4dd..13fe8e62fe17 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -736,7 +736,7 @@ long getLastCommittedBytesUsed() { } /** - * Commits the current {@link #ramBytesUsed()} and stores it's value for later reuse. The last + * Commits the current {@link #ramBytesUsed()} and stores its value for later reuse. 
The last * committed bytes used can be retrieved via {@link #getLastCommittedBytesUsed()} */ void commitLastBytesUsed(long delta) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexReaderContext.java b/lucene/core/src/java/org/apache/lucene/index/IndexReaderContext.java index 167900575aff..3c79a5e9c7cb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexReaderContext.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexReaderContext.java @@ -19,7 +19,7 @@ import java.util.List; /** - * A struct like class that represents a hierarchical relationship between {@link IndexReader} + * A struct-like class that represents a hierarchical relationship between {@link IndexReader} * instances. */ public abstract sealed class IndexReaderContext permits CompositeReaderContext, LeafReaderContext { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index ad11476e7345..7c907b31b5dc 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -2979,7 +2979,7 @@ private List acquireWriteLocks(Directory... dirs) throws IOException { * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @throws IllegalArgumentException if addIndexes would cause the index to exceed {@link - * #MAX_DOCS}, or if the indoming index sort does not match this index's index sort + * #MAX_DOCS}, or if the incoming index sort does not match this index's index sort */ public long addIndexes(Directory... dirs) throws IOException { ensureOpen(); @@ -6029,7 +6029,7 @@ private void processEvents(boolean triggerMerge) throws IOException { /** * Interface for internal atomic events. See {@link DocumentsWriter} for details. Events are * executed concurrently and no order is guaranteed. Each event should only rely on the - * serializeability within its process method. All actions that must happen before or after a + * serializability within its process method. All actions that must happen before or after a * certain action must be encoded inside the {@link #process(IndexWriter)} method. */ @FunctionalInterface diff --git a/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java index 881ae099d5f8..b6dc9848c9df 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java @@ -636,6 +636,31 @@ public MergeSpecification findMerges( mergeDocs += segmentDocs; } + if (end - start >= mergeFactor + && minMergeSize < maxMergeSize + && mergeSize < minMergeSize + && anyMerging == false) { + // If the merge has mergeFactor segments but is still smaller than the min merged segment + // size, keep packing candidate segments. 
+ while (end < 1 + upto) { + final SegmentInfoAndLevel segLevel = levels.get(end); + final SegmentCommitInfo info = segLevel.info; + if (mergingSegments.contains(info)) { + anyMerging = true; + break; + } + long segmentSize = size(info, mergeContext); + long segmentDocs = sizeDocs(info, mergeContext); + if (mergeSize + segmentSize > minMergeSize || mergeDocs + segmentDocs > maxMergeDocs) { + break; + } + + mergeSize += segmentSize; + mergeDocs += segmentDocs; + end++; + } + } + if (anyMerging || end - start <= 1) { // skip: there is an ongoing merge at the current level or the computed merge has a single // segment and this merge policy doesn't do singleton merges diff --git a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java index d66f5648c03d..cbea98daf58f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java @@ -756,7 +756,7 @@ public boolean useCompoundFile( /** * Return the byte size of the provided {@link SegmentCommitInfo}, prorated by percentage of - * non-deleted documents is set. + * non-deleted documents. */ protected long size(SegmentCommitInfo info, MergeContext mergeContext) throws IOException { long byteSize = info.sizeInBytes(); @@ -838,7 +838,7 @@ public void setMaxCFSSegmentSizeMB(double v) { } /** - * Returns true if the segment represented by the given CodecReader should be keep even if it's + * Returns true if the segment represented by the given CodecReader should be kept even if it's * fully deleted. This is useful for testing of for instance if the merge policy implements * retention policies for soft deletes. */ diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesFieldUpdates.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesFieldUpdates.java index d58a12c88253..a3c14486fbda 100644 --- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesFieldUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesFieldUpdates.java @@ -17,8 +17,11 @@ package org.apache.lucene.index; import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.util.BitSet; +import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.SparseFixedBitSet; import org.apache.lucene.util.packed.AbstractPagedMutable; import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PagedGrowableWriter; @@ -130,23 +133,104 @@ public long ramBytesUsed() { + RamUsageEstimator.NUM_BYTES_OBJECT_REF; } - static class SingleValueNumericDocValuesFieldUpdates extends SingleValueDocValuesFieldUpdates { + static class SingleValueNumericDocValuesFieldUpdates extends DocValuesFieldUpdates { private final long value; + private final BitSet bitSet; + private BitSet hasNoValue; + private boolean hasAtLeastOneValue; SingleValueNumericDocValuesFieldUpdates(long delGen, String field, int maxDoc, long value) { super(maxDoc, delGen, field, DocValuesType.NUMERIC); + this.bitSet = new SparseFixedBitSet(maxDoc); this.value = value; } + // pkg private for testing + long longValue() { + return value; + } + @Override - protected BytesRef binaryValue() { + void add(int doc, long value) { + assert this.value == value; + bitSet.set(doc); + this.hasAtLeastOneValue = true; + if (hasNoValue != null) { + hasNoValue.clear(doc); + } + } + + @Override + void 
add(int doc, BytesRef value) { throw new UnsupportedOperationException(); } @Override - protected long longValue() { - return value; + synchronized void reset(int doc) { + bitSet.set(doc); + this.hasAtLeastOneValue = true; + if (hasNoValue == null) { + hasNoValue = new SparseFixedBitSet(maxDoc); + } + hasNoValue.set(doc); + } + + @Override + void add(int docId, Iterator iterator) { + throw new UnsupportedOperationException(); + } + + @Override + synchronized boolean any() { + return super.any() || hasAtLeastOneValue; + } + + @Override + public long ramBytesUsed() { + return super.ramBytesUsed() + + bitSet.ramBytesUsed() + + (hasNoValue == null ? 0 : hasNoValue.ramBytesUsed()); + } + + @Override + Iterator iterator() { + BitSetIterator iterator = new BitSetIterator(bitSet, maxDoc); + return new DocValuesFieldUpdates.Iterator() { + + @Override + public int docID() { + return iterator.docID(); + } + + @Override + public int nextDoc() { + return iterator.nextDoc(); + } + + @Override + long longValue() { + return value; + } + + @Override + BytesRef binaryValue() { + throw new UnsupportedOperationException(); + } + + @Override + long delGen() { + return delGen; + } + + @Override + boolean hasValue() { + if (hasNoValue != null) { + return hasNoValue.get(docID()) == false; + } + return true; + } + }; } } } diff --git a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java index 1bac886c7b4c..b7ca6634efbf 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java @@ -273,7 +273,7 @@ boolean writeDocValuesUpdatesForMerge(List infos) throws IOEx } /** - * Returns a list of all currently maintained ReadersAndUpdates sorted by it's ram consumption + * Returns a list of all currently maintained ReadersAndUpdates sorted by their ram consumption * largest to smallest. This list can also contain readers that don't consume any ram at this * point i.e. don't have any updates buffered. */ diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java index 979d4a7712f5..12f48a1d98f8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java @@ -48,7 +48,7 @@ public final class SegmentReader extends CodecReader { private final SegmentCommitInfo si; // this is the original SI that IW uses internally but it's mutated behind the scenes - // and we don't want this SI to be used for anything. Yet, IW needs this to do maintainance + // and we don't want this SI to be used for anything. Yet, IW needs this to do maintenance // and lookup pooled readers etc. private final SegmentCommitInfo originalSi; private final LeafMetaData metaData; diff --git a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java index 1515c8469c17..bbe493d88d31 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java @@ -32,9 +32,9 @@ import org.apache.lucene.util.FixedBitSet; /** - * This reader filters out documents that have a doc values value in the given field and treat these - * documents as soft deleted. 
Hard deleted documents will also be filtered out in the life docs of - * this reader. + * This reader filters out documents that have a doc-values value in the given field and treats + * these documents as soft-deleted. Hard deleted documents will also be filtered out in the live + * docs of this reader. * * @see IndexWriterConfig#setSoftDeletesField(String) * @see IndexWriter#softUpdateDocument(Term, Iterable, Field...) @@ -68,7 +68,7 @@ private SoftDeletesDirectoryReaderWrapper(DirectoryReader in, SoftDeletesSubRead protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) throws IOException { Map readerCache = new HashMap<>(); for (LeafReader reader : getSequentialSubReaders()) { - // we try to reuse the life docs instances here if the reader cache key didn't change + // we try to reuse the live docs instances here if the reader cache key didn't change if (reader instanceof SoftDeletesFilterLeafReader && reader.getReaderCacheHelper() != null) { readerCache.put( ((SoftDeletesFilterLeafReader) reader).reader.getReaderCacheHelper().getKey(), reader); diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index daec0c197d6a..ab9964026ad8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -314,6 +314,7 @@ private static class SortingFloatVectorValues extends FloatVectorValues { SortingFloatVectorValues(FloatVectorValues delegate, Sorter.DocMap sortMap) throws IOException { this.delegate = delegate; + assert delegate != null; // SortingValuesIterator consumes the iterator and records the docs and ord mapping iteratorSupplier = iteratorSupplier(delegate, sortMap); } @@ -446,6 +447,9 @@ private SortingCodecReader( @Override public FieldsProducer getPostingsReader() { FieldsProducer postingsReader = in.getPostingsReader(); + if (postingsReader == null) { + return null; + } return new FieldsProducer() { @Override public void close() throws IOException { @@ -481,6 +485,9 @@ public int size() { @Override public StoredFieldsReader getFieldsReader() { StoredFieldsReader delegate = in.getFieldsReader(); + if (delegate == null) { + return null; + } return newStoredFieldsReader(delegate); } @@ -526,6 +533,9 @@ public Bits getLiveDocs() { @Override public PointsReader getPointsReader() { final PointsReader delegate = in.getPointsReader(); + if (delegate == null) { + return null; + } return new PointsReader() { @Override public void checkIntegrity() throws IOException { @@ -551,6 +561,9 @@ public void close() throws IOException { @Override public KnnVectorsReader getVectorReader() { KnnVectorsReader delegate = in.getVectorReader(); + if (delegate == null) { + return null; + } return new KnnVectorsReader() { @Override public void checkIntegrity() throws IOException { @@ -587,6 +600,9 @@ public void close() throws IOException { @Override public NormsProducer getNormsReader() { final NormsProducer delegate = in.getNormsReader(); + if (delegate == null) { + return null; + } return new NormsProducer() { @Override public NumericDocValues getNorms(FieldInfo field) throws IOException { @@ -609,6 +625,9 @@ public void close() throws IOException { @Override public DocValuesProducer getDocValuesReader() { final DocValuesProducer delegate = in.getDocValuesReader(); + if (delegate == null) { + return null; + } return new DocValuesProducer() { @Override public NumericDocValues getNumeric(FieldInfo field) 
throws IOException { @@ -710,6 +729,9 @@ public TermVectorsReader getTermVectorsReader() { } private TermVectorsReader newTermVectorsReader(TermVectorsReader delegate) { + if (delegate == null) { + return null; + } return new TermVectorsReader() { @Override public void prefetch(int doc) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java b/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java index 2457f392d112..95dbf11b7209 100644 --- a/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java +++ b/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java @@ -63,7 +63,7 @@ public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {} /** Process a string field. */ public void stringField(FieldInfo fieldInfo, String value) throws IOException {} - /** Process a int numeric field. */ + /** Process an int numeric field. */ public void intField(FieldInfo fieldInfo, int value) throws IOException {} /** Process a long numeric field. */ diff --git a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java index b487012cc7d9..70036ce9acb6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java @@ -30,9 +30,7 @@ /** * Merges segments of approximately equal size, subject to an allowed number of segments per tier. * This is similar to {@link LogByteSizeMergePolicy}, except this merge policy is able to merge - * non-adjacent segment, and separates how many segments are merged at once ({@link - * #setMaxMergeAtOnce}) from how many segments are allowed per tier ({@link #setSegmentsPerTier}). - * This merge policy also does not over-merge (i.e. cascade merges). + * non-adjacent segment. This merge policy also does not over-merge (i.e. cascade merges). * *
<p>
For normal merging, this policy first computes a "budget" of how many segments are allowed to * be in the index. If the index is over-budget, then the policy sorts segments by decreasing size @@ -84,9 +82,6 @@ public class TieredMergePolicy extends MergePolicy { */ public static final double DEFAULT_NO_CFS_RATIO = 0.1; - // User-specified maxMergeAtOnce. In practice we always take the min of its - // value and segsPerTier for segments above the floor size to avoid suboptimal merging. - private int maxMergeAtOnce = 30; private long maxMergedSegmentBytes = 5 * 1024 * 1024 * 1024L; private long floorSegmentBytes = 2 * 1024 * 1024L; @@ -100,36 +95,12 @@ public TieredMergePolicy() { super(DEFAULT_NO_CFS_RATIO, MergePolicy.DEFAULT_MAX_CFS_SEGMENT_SIZE); } - /** - * Maximum number of segments to be merged at a time during "normal" merging. Default is 30. - * - *
<p>
NOTE: Merges above the {@link #setFloorSegmentMB(double) floor segment size} also - * bound the number of merged segments by {@link #setSegmentsPerTier(double) the number of - * segments per tier}. - */ - public TieredMergePolicy setMaxMergeAtOnce(int v) { - if (v < 2) { - throw new IllegalArgumentException("maxMergeAtOnce must be > 1 (got " + v + ")"); - } - maxMergeAtOnce = v; - return this; - } - private enum MERGE_TYPE { NATURAL, FORCE_MERGE, FORCE_MERGE_DELETES } - /** - * Returns the current maxMergeAtOnce setting. - * - * @see #setMaxMergeAtOnce - */ - public int getMaxMergeAtOnce() { - return maxMergeAtOnce; - } - // TODO: should addIndexes do explicit merging, too? And, // if user calls IW.maybeMerge "explicitly" @@ -157,9 +128,10 @@ public double getMaxMergedSegmentMB() { } /** - * Controls the maximum percentage of deleted documents that is tolerated in the index. Lower - * values make the index more space efficient at the expense of increased CPU and I/O activity. - * Values must be between 5 and 50. Default value is 20. + * Sets the maximum percentage of doc id space taken by deleted docs. The denominator includes + * both active and deleted documents. Lower values make the index more space efficient at the + * expense of increased CPU and I/O activity. Values must be between 5 and 50. Default value is + * 20. * *
<p>
When the maximum delete percentage is lowered, the indexing thread will call for merges more * often, meaning that write amplification factor will be increased. Write amplification factor @@ -428,7 +400,7 @@ public MergeSpecification findMerges( } allowedDelCount = Math.max(0, allowedDelCount); - final int mergeFactor = (int) Math.min(maxMergeAtOnce, segsPerTier); + final int mergeFactor = (int) segsPerTier; // Compute max allowed segments for the remainder of the index long levelSize = Math.max(minSegmentBytes, floorSegmentBytes); long bytesLeft = totIndexBytes; @@ -569,7 +541,6 @@ private MergeSpecification doFindMerges( long docCountThisMerge = 0; for (int idx = startIdx; idx < sortedEligible.size() - && candidate.size() < maxMergeAtOnce // We allow merging more than mergeFactor segments together if the merged segment // would be less than the floor segment size. This is important because segments // below the floor segment size are more aggressively merged by this policy, so we @@ -732,7 +703,7 @@ protected MergeScore score( // matter in this case because this merge will not // "cascade" and so it cannot lead to N^2 merge cost // over time: - final int mergeFactor = (int) Math.min(maxMergeAtOnce, segsPerTier); + int mergeFactor = (int) segsPerTier; skew = 1.0 / mergeFactor; } else { skew = @@ -1020,7 +991,6 @@ private long floorSize(long bytes) { @Override public String toString() { StringBuilder sb = new StringBuilder("[" + getClass().getSimpleName() + ": "); - sb.append("maxMergeAtOnce=").append(maxMergeAtOnce).append(", "); sb.append("maxMergedSegmentMB=").append(maxMergedSegmentBytes / 1024. / 1024.).append(", "); sb.append("floorSegmentMB=").append(floorSegmentBytes / 1024. / 1024.).append(", "); sb.append("forceMergeDeletesPctAllowed=").append(forceMergeDeletesPctAllowed).append(", "); diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java index 5d7dfaf8b832..a6599a57fd25 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java @@ -164,37 +164,6 @@ public long cost() { return cost; } - private void scoreDisiWrapperIntoBitSet(DisiWrapper w, Bits acceptDocs, int min, int max) - throws IOException { - boolean needsScores = BooleanScorer.this.needsScores; - FixedBitSet matching = BooleanScorer.this.matching; - Bucket[] buckets = BooleanScorer.this.buckets; - - DocIdSetIterator it = w.iterator; - Scorable scorer = w.scorable; - int doc = w.doc; - if (doc < min) { - doc = it.advance(min); - } - if (buckets == null) { - it.intoBitSet(acceptDocs, max, matching, doc & ~MASK); - } else { - for (; doc < max; doc = it.nextDoc()) { - if (acceptDocs == null || acceptDocs.get(doc)) { - final int i = doc & MASK; - matching.set(i); - final Bucket bucket = buckets[i]; - bucket.freq++; - if (needsScores) { - bucket.score += scorer.score(); - } - } - } - } - - w.doc = it.docID(); - } - private void scoreWindowIntoBitSetAndReplay( LeafCollector collector, Bits acceptDocs, @@ -207,7 +176,35 @@ private void scoreWindowIntoBitSetAndReplay( for (int i = 0; i < numScorers; ++i) { final DisiWrapper w = scorers[i]; assert w.doc < max; - scoreDisiWrapperIntoBitSet(w, acceptDocs, min, max); + + DocIdSetIterator it = w.iterator; + int doc = w.doc; + if (doc < min) { + doc = it.advance(min); + } + if (buckets == null) { + // This doesn't apply live docs, so we'll need to apply them later + it.intoBitSet(max, matching, base); + } 
else { + for (; doc < max; doc = it.nextDoc()) { + if (acceptDocs == null || acceptDocs.get(doc)) { + final int d = doc & MASK; + matching.set(d); + final Bucket bucket = buckets[d]; + bucket.freq++; + if (needsScores) { + bucket.score += w.scorable.score(); + } + } + } + } + + w.doc = it.docID(); + } + + if (buckets == null && acceptDocs != null) { + // In this case, live docs have not been applied yet. + acceptDocs.applyMask(matching, base); } docIdStreamView.base = base; diff --git a/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java index 2acf04ba501b..121687245248 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java @@ -105,7 +105,11 @@ private void scoreWindowUsingBitSet( assert clauseWindowMatches.scanIsEmpty(); int offset = lead.docID(); - lead.intoBitSet(acceptDocs, max, windowMatches, offset); + lead.intoBitSet(max, windowMatches, offset); + if (acceptDocs != null) { + // Apply live docs. + acceptDocs.applyMask(windowMatches, offset); + } int upTo = 0; for (; @@ -116,9 +120,7 @@ private void scoreWindowUsingBitSet( if (other.docID() < offset) { other.advance(offset); } - // No need to apply acceptDocs on other clauses since we already applied live docs on the - // leading clause. - other.intoBitSet(null, max, clauseWindowMatches, offset); + other.intoBitSet(max, clauseWindowMatches, offset); windowMatches.and(clauseWindowMatches); clauseWindowMatches.clear(); } diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java index 034f46ed93f9..d6bdf82e48d0 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java @@ -16,8 +16,6 @@ */ package org.apache.lucene.search; -import java.util.Arrays; -import java.util.Iterator; import org.apache.lucene.util.PriorityQueue; /** @@ -27,205 +25,51 @@ * * @lucene.internal */ -public final class DisiPriorityQueue implements Iterable { - - static int leftNode(int node) { - return ((node + 1) << 1) - 1; - } - - static int rightNode(int leftNode) { - return leftNode + 1; - } - - static int parentNode(int node) { - return ((node + 1) >>> 1) - 1; +public abstract sealed class DisiPriorityQueue implements Iterable + permits DisiPriorityQueue2, DisiPriorityQueueN { + + /** Create a {@link DisiPriorityQueue} of the given maximum size. */ + public static DisiPriorityQueue ofMaxSize(int maxSize) { + if (maxSize <= 2) { + return new DisiPriorityQueue2(); + } else { + return new DisiPriorityQueueN(maxSize); + } } - private final DisiWrapper[] heap; - private int size; + /** Return the number of entries in this heap. */ + public abstract int size(); - public DisiPriorityQueue(int maxSize) { - heap = new DisiWrapper[maxSize]; - size = 0; - } - - public int size() { - return size; - } - - public DisiWrapper top() { - return heap[0]; - } + /** Return top value in this heap, or null if the heap is empty. */ + public abstract DisiWrapper top(); /** Return the 2nd least value in this heap, or null if the heap contains less than 2 values. 
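The bulk-scoring changes above stop passing acceptDocs into DocIdSetIterator#intoBitSet and instead apply the live-docs mask to the collected window afterwards, presumably so the iterator can fill the window with cheap bulk operations and deletions are cleared once per window. A minimal sketch of that two-step pattern, assuming the intoBitSet(int, FixedBitSet, int) and Bits#applyMask(FixedBitSet, int) signatures introduced by this patch:

```java
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;

/** Sketch: bulk-collect a scoring window, then apply live docs in a second pass. */
final class WindowCollector {

  static void collectWindow(
      DocIdSetIterator it, Bits acceptDocs, int offset, int max, FixedBitSet window)
      throws IOException {
    if (it.docID() < offset) {
      it.advance(offset);
    }
    // Collect all matches in [offset, max) without filtering by live docs ...
    it.intoBitSet(max, window, offset);
    if (acceptDocs != null) {
      // ... then clear the bits of deleted docs in one pass over the window.
      acceptDocs.applyMask(window, offset);
    }
  }
}
```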
*/ - public DisiWrapper top2() { - switch (size()) { - case 0: - case 1: - return null; - case 2: - return heap[1]; - default: - if (heap[1].doc <= heap[2].doc) { - return heap[1]; - } else { - return heap[2]; - } - } - } + public abstract DisiWrapper top2(); /** Get the list of scorers which are on the current doc. */ - public DisiWrapper topList() { - final DisiWrapper[] heap = this.heap; - final int size = this.size; - DisiWrapper list = heap[0]; - list.next = null; - if (size >= 3) { - list = topList(list, heap, size, 1); - list = topList(list, heap, size, 2); - } else if (size == 2 && heap[1].doc == list.doc) { - list = prepend(heap[1], list); - } - return list; - } - - // prepend w1 (iterator) to w2 (list) - private DisiWrapper prepend(DisiWrapper w1, DisiWrapper w2) { - w1.next = w2; - return w1; - } - - private DisiWrapper topList(DisiWrapper list, DisiWrapper[] heap, int size, int i) { - final DisiWrapper w = heap[i]; - if (w.doc == list.doc) { - list = prepend(w, list); - final int left = leftNode(i); - final int right = left + 1; - if (right < size) { - list = topList(list, heap, size, left); - list = topList(list, heap, size, right); - } else if (left < size && heap[left].doc == list.doc) { - list = prepend(heap[left], list); - } - } - return list; - } + public abstract DisiWrapper topList(); - public DisiWrapper add(DisiWrapper entry) { - final DisiWrapper[] heap = this.heap; - final int size = this.size; - heap[size] = entry; - upHeap(size); - this.size = size + 1; - return heap[0]; - } + /** Add a {@link DisiWrapper} to this queue and return the top entry. */ + public abstract DisiWrapper add(DisiWrapper entry); + /** Bulk add. */ public void addAll(DisiWrapper[] entries, int offset, int len) { - // Nothing to do if empty: - if (len == 0) { - return; - } - - // Fail early if we're going to over-fill: - if (size + len > heap.length) { - throw new IndexOutOfBoundsException( - "Cannot add " - + len - + " elements to a queue with remaining capacity " - + (heap.length - size)); - } - - // Copy the entries over to our heap array: - System.arraycopy(entries, offset, heap, size, len); - size += len; - - // Heapify in bulk: - final int firstLeafIndex = size >>> 1; - for (int rootIndex = firstLeafIndex - 1; rootIndex >= 0; rootIndex--) { - int parentIndex = rootIndex; - DisiWrapper parent = heap[parentIndex]; - while (parentIndex < firstLeafIndex) { - int childIndex = leftNode(parentIndex); - int rightChildIndex = rightNode(childIndex); - DisiWrapper child = heap[childIndex]; - if (rightChildIndex < size && heap[rightChildIndex].doc < child.doc) { - child = heap[rightChildIndex]; - childIndex = rightChildIndex; - } - if (child.doc >= parent.doc) { - break; - } - heap[parentIndex] = child; - parentIndex = childIndex; - } - heap[parentIndex] = parent; + for (int i = 0; i < len; ++i) { + add(entries[offset + i]); } } - public DisiWrapper pop() { - final DisiWrapper[] heap = this.heap; - final DisiWrapper result = heap[0]; - final int i = --size; - heap[0] = heap[i]; - heap[i] = null; - downHeap(i); - return result; - } + /** Remove the top entry and return it. */ + public abstract DisiWrapper pop(); - public DisiWrapper updateTop() { - downHeap(size); - return heap[0]; - } + /** Rebalance this heap and return the top entry. */ + public abstract DisiWrapper updateTop(); - DisiWrapper updateTop(DisiWrapper topReplacement) { - heap[0] = topReplacement; - return updateTop(); - } + /** + * Replace the top entry with the given entry, rebalance the heap, and return the new top entry. 
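With DisiPriorityQueue now sealed, callers obtain an implementation through the new factory rather than constructing the heap directly: a two-slot specialization serves the very common one- or two-clause case, and the array-backed heap everything else. A trivial usage sketch (the helper name is hypothetical):

```java
import org.apache.lucene.search.DisiPriorityQueue;

/** Hypothetical helper: pick a size-appropriate queue for a disjunction's clause count. */
final class QueueSizing {

  static DisiPriorityQueue forClauseCount(int numClauses) {
    // Returns the two-slot DisiPriorityQueue2 when numClauses <= 2,
    // and the array-backed DisiPriorityQueueN heap otherwise.
    return DisiPriorityQueue.ofMaxSize(numClauses);
  }
}
```

The two-entry variant replaces heap sift operations with a single comparison and swap, which is why it is worth special-casing for two-clause disjunctions.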
+ */ + abstract DisiWrapper updateTop(DisiWrapper topReplacement); /** Clear the heap. */ - public void clear() { - Arrays.fill(heap, null); - size = 0; - } - - void upHeap(int i) { - final DisiWrapper node = heap[i]; - final int nodeDoc = node.doc; - int j = parentNode(i); - while (j >= 0 && nodeDoc < heap[j].doc) { - heap[i] = heap[j]; - i = j; - j = parentNode(j); - } - heap[i] = node; - } - - void downHeap(int size) { - int i = 0; - final DisiWrapper node = heap[0]; - int j = leftNode(i); - if (j < size) { - int k = rightNode(j); - if (k < size && heap[k].doc < heap[j].doc) { - j = k; - } - if (heap[j].doc < node.doc) { - do { - heap[i] = heap[j]; - i = j; - j = leftNode(i); - k = rightNode(j); - if (k < size && heap[k].doc < heap[j].doc) { - j = k; - } - } while (j < size && heap[j].doc < node.doc); - heap[i] = node; - } - } - } - - @Override - public Iterator iterator() { - return Arrays.asList(heap).subList(0, size).iterator(); - } + public abstract void clear(); } diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue2.java b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue2.java new file mode 100644 index 000000000000..b7e587382db7 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue2.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; + +/** {@link DisiPriorityQueue} of two entries or less. */ +final class DisiPriorityQueue2 extends DisiPriorityQueue { + + private DisiWrapper top, top2; + + @Override + public Iterator iterator() { + if (top2 != null) { + return Arrays.asList(top, top2).iterator(); + } else if (top != null) { + return Collections.singleton(top).iterator(); + } else { + return Collections.emptyIterator(); + } + } + + @Override + public int size() { + return top2 == null ? (top == null ? 
0 : 1) : 2; + } + + @Override + public DisiWrapper top() { + return top; + } + + @Override + public DisiWrapper top2() { + return top2; + } + + @Override + public DisiWrapper topList() { + DisiWrapper topList = null; + if (top != null) { + top.next = null; + topList = top; + if (top2 != null && top.doc == top2.doc) { + top2.next = topList; + topList = top2; + } + } + return topList; + } + + @Override + public DisiWrapper add(DisiWrapper entry) { + if (top == null) { + return top = entry; + } else if (top2 == null) { + top2 = entry; + return updateTop(); + } else { + throw new IllegalStateException( + "Trying to add a 3rd element to a DisiPriorityQueue configured with a max size of 2"); + } + } + + @Override + public DisiWrapper pop() { + DisiWrapper ret = top; + top = top2; + top2 = null; + return ret; + } + + @Override + public DisiWrapper updateTop() { + if (top2 != null && top2.doc < top.doc) { + DisiWrapper tmp = top; + top = top2; + top2 = tmp; + } + return top; + } + + @Override + DisiWrapper updateTop(DisiWrapper topReplacement) { + top = topReplacement; + return updateTop(); + } + + @Override + public void clear() { + top = null; + top2 = null; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueueN.java b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueueN.java new file mode 100644 index 000000000000..b841c3ef0ef1 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueueN.java @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search; + +import java.util.Arrays; +import java.util.Iterator; + +final class DisiPriorityQueueN extends DisiPriorityQueue { + + static int leftNode(int node) { + return ((node + 1) << 1) - 1; + } + + static int rightNode(int leftNode) { + return leftNode + 1; + } + + static int parentNode(int node) { + return ((node + 1) >>> 1) - 1; + } + + private final DisiWrapper[] heap; + private int size; + + DisiPriorityQueueN(int maxSize) { + heap = new DisiWrapper[maxSize]; + size = 0; + } + + @Override + public int size() { + return size; + } + + @Override + public DisiWrapper top() { + return heap[0]; + } + + @Override + public DisiWrapper top2() { + switch (size()) { + case 0: + case 1: + return null; + case 2: + return heap[1]; + default: + if (heap[1].doc <= heap[2].doc) { + return heap[1]; + } else { + return heap[2]; + } + } + } + + @Override + public DisiWrapper topList() { + final DisiWrapper[] heap = this.heap; + final int size = this.size; + DisiWrapper list = heap[0]; + list.next = null; + if (size >= 3) { + list = topList(list, heap, size, 1); + list = topList(list, heap, size, 2); + } else if (size == 2 && heap[1].doc == list.doc) { + list = prepend(heap[1], list); + } + return list; + } + + // prepend w1 (iterator) to w2 (list) + private DisiWrapper prepend(DisiWrapper w1, DisiWrapper w2) { + w1.next = w2; + return w1; + } + + private DisiWrapper topList(DisiWrapper list, DisiWrapper[] heap, int size, int i) { + final DisiWrapper w = heap[i]; + if (w.doc == list.doc) { + list = prepend(w, list); + final int left = leftNode(i); + final int right = rightNode(left); + if (right < size) { + list = topList(list, heap, size, left); + list = topList(list, heap, size, right); + } else if (left < size && heap[left].doc == list.doc) { + list = prepend(heap[left], list); + } + } + return list; + } + + @Override + public DisiWrapper add(DisiWrapper entry) { + final DisiWrapper[] heap = this.heap; + final int size = this.size; + heap[size] = entry; + upHeap(size); + this.size = size + 1; + return heap[0]; + } + + @Override + public void addAll(DisiWrapper[] entries, int offset, int len) { + // Nothing to do if empty: + if (len == 0) { + return; + } + + // Fail early if we're going to over-fill: + if (size + len > heap.length) { + throw new IndexOutOfBoundsException( + "Cannot add " + + len + + " elements to a queue with remaining capacity " + + (heap.length - size)); + } + + // Copy the entries over to our heap array: + System.arraycopy(entries, offset, heap, size, len); + size += len; + + // Heapify in bulk: + final int firstLeafIndex = size >>> 1; + for (int rootIndex = firstLeafIndex - 1; rootIndex >= 0; rootIndex--) { + int parentIndex = rootIndex; + DisiWrapper parent = heap[parentIndex]; + while (parentIndex < firstLeafIndex) { + int childIndex = leftNode(parentIndex); + int rightChildIndex = rightNode(childIndex); + DisiWrapper child = heap[childIndex]; + if (rightChildIndex < size && heap[rightChildIndex].doc < child.doc) { + child = heap[rightChildIndex]; + childIndex = rightChildIndex; + } + if (child.doc >= parent.doc) { + break; + } + heap[parentIndex] = child; + parentIndex = childIndex; + } + heap[parentIndex] = parent; + } + } + + @Override + public DisiWrapper pop() { + final DisiWrapper[] heap = this.heap; + final DisiWrapper result = heap[0]; + final int i = --size; + heap[0] = heap[i]; + heap[i] = null; + downHeap(i); + return result; + } + + @Override + public DisiWrapper updateTop() { + downHeap(size); + return heap[0]; + } + + @Override + 
DisiWrapper updateTop(DisiWrapper topReplacement) { + heap[0] = topReplacement; + return updateTop(); + } + + @Override + public void clear() { + Arrays.fill(heap, null); + size = 0; + } + + void upHeap(int i) { + final DisiWrapper node = heap[i]; + final int nodeDoc = node.doc; + int j = parentNode(i); + while (j >= 0 && nodeDoc < heap[j].doc) { + heap[i] = heap[j]; + i = j; + j = parentNode(j); + } + heap[i] = node; + } + + void downHeap(int size) { + int i = 0; + final DisiWrapper node = heap[0]; + int j = leftNode(i); + if (j < size) { + int k = rightNode(j); + if (k < size && heap[k].doc < heap[j].doc) { + j = k; + } + if (heap[j].doc < node.doc) { + do { + heap[i] = heap[j]; + i = j; + j = leftNode(i); + k = rightNode(j); + if (k < size && heap[k].doc < heap[j].doc) { + j = k; + } + } while (j < size && heap[j].doc < node.doc); + heap[i] = node; + } + } + } + + @Override + public Iterator iterator() { + return Arrays.asList(heap).subList(0, size).iterator(); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java index 6ab57c7b180c..08018dacf9b8 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java @@ -21,7 +21,6 @@ import java.util.Collection; import java.util.Comparator; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; /** @@ -92,7 +91,7 @@ public DisjunctionDISIApproximation( // Build the PQ: assert lastIdx >= -1 && lastIdx < wrappers.length - 1; int pqLen = wrappers.length - lastIdx - 1; - leadIterators = new DisiPriorityQueue(pqLen); + leadIterators = DisiPriorityQueue.ofMaxSize(pqLen); leadIterators.addAll(wrappers, lastIdx + 1, pqLen); // Build the non-PQ list: @@ -150,17 +149,16 @@ public int advance(int target) throws IOException { } @Override - public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset) - throws IOException { + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { while (leadTop.doc < upTo) { - leadTop.approximation.intoBitSet(acceptDocs, upTo, bitSet, offset); + leadTop.approximation.intoBitSet(upTo, bitSet, offset); leadTop.doc = leadTop.approximation.docID(); leadTop = leadIterators.updateTop(); } minOtherDoc = Integer.MAX_VALUE; for (DisiWrapper w : otherIterators) { - w.approximation.intoBitSet(acceptDocs, upTo, bitSet, offset); + w.approximation.intoBitSet(upTo, bitSet, offset); w.doc = w.approximation.docID(); minOtherDoc = Math.min(minOtherDoc, w.doc); } diff --git a/lucene/core/src/java/org/apache/lucene/search/DocIdSetBulkIterator.java b/lucene/core/src/java/org/apache/lucene/search/DocIdSetBulkIterator.java new file mode 100644 index 000000000000..87912beecccb --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DocIdSetBulkIterator.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import org.apache.lucene.util.Bits; + +/** Bulk iterator over a {@link DocIdSetIterator}. */ +public abstract class DocIdSetBulkIterator { + + /** Sole constructor, invoked by sub-classes. */ + protected DocIdSetBulkIterator() {} + + /** + * Iterate over documents contained in this iterator and call {@link LeafCollector#collect} on + * them. + */ + public abstract void iterate(LeafCollector collector, Bits acceptDocs, int min, int max) + throws IOException; +} diff --git a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java index ee30f627a56b..421323440865 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java @@ -17,7 +17,6 @@ package org.apache.lucene.search; import java.io.IOException; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; /** @@ -220,9 +219,7 @@ protected final int slowAdvance(int target) throws IOException { * *

    * for (int doc = docID(); doc < upTo; doc = nextDoc()) {
-   *   if (acceptDocs == null || acceptDocs.get(doc)) {
-   *     bitSet.set(doc - offset);
-   *   }
+   *   bitSet.set(doc - offset);
    * }
    * 
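As a caller-side illustration of the updated contract, where acceptDocs filtering no longer happens inside intoBitSet, here is a minimal sketch. It is not part of the patch; the concrete doc IDs and the 64-doc window are invented for this example.

import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;

class IntoBitSetSketch {
  static void demo() throws IOException {
    FixedBitSet docs = new FixedBitSet(256);
    docs.set(130);
    docs.set(135);
    docs.set(200);
    DocIdSetIterator disi = new BitSetIterator(docs, 3);
    disi.nextDoc();                            // position on the first match (130)
    int offset = disi.docID();
    FixedBitSet window = new FixedBitSet(64);
    disi.intoBitSet(offset + 64, window, offset);
    // window now has bits 0 (doc 130) and 5 (doc 135) set; doc 200 is beyond
    // upTo, so the iterator is left positioned on it.
  }
}

Filtering by live docs is handled separately by the caller, for example with the Bits#applyMask method introduced later in this patch.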
* @@ -231,15 +228,14 @@ protected final int slowAdvance(int target) throws IOException { * *

Note: It is important not to clear bits from {@code bitSet} that may be already set. * + *

Note: {@code offset} may be negative. + * * @lucene.internal */ - public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset) - throws IOException { + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { assert offset <= docID(); for (int doc = docID(); doc < upTo; doc = nextDoc()) { - if (acceptDocs == null || acceptDocs.get(doc)) { - bitSet.set(doc - offset); - } + bitSet.set(doc - offset); } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/KnnByteVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/KnnByteVectorQuery.java index 35144055830c..05157ab65cb5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/KnnByteVectorQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/KnnByteVectorQuery.java @@ -46,7 +46,7 @@ public class KnnByteVectorQuery extends AbstractKnnVectorQuery { private static final TopDocs NO_RESULTS = TopDocsCollector.EMPTY_TOPDOCS; - private final byte[] target; + protected final byte[] target; /** * Find the k nearest documents to the target vector according to the vectors in the diff --git a/lucene/core/src/java/org/apache/lucene/search/KnnCollector.java b/lucene/core/src/java/org/apache/lucene/search/KnnCollector.java index 43bac9fbc309..f694d8f7085c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/KnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/KnnCollector.java @@ -85,4 +85,58 @@ public interface KnnCollector { * @return The collected top documents */ TopDocs topDocs(); + + /** + * KnnCollector.Decorator is the base class for decorators of KnnCollector objects, which extend + * the object with new behaviors. + * + * @lucene.experimental + */ + abstract class Decorator implements KnnCollector { + private final KnnCollector collector; + + public Decorator(KnnCollector collector) { + this.collector = collector; + } + + @Override + public boolean earlyTerminated() { + return collector.earlyTerminated(); + } + + @Override + public void incVisitedCount(int count) { + collector.incVisitedCount(count); + } + + @Override + public long visitedCount() { + return collector.visitedCount(); + } + + @Override + public long visitLimit() { + return collector.visitLimit(); + } + + @Override + public int k() { + return collector.k(); + } + + @Override + public boolean collect(int docId, float similarity) { + return collector.collect(docId, similarity); + } + + @Override + public float minCompetitiveSimilarity() { + return collector.minCompetitiveSimilarity(); + } + + @Override + public TopDocs topDocs() { + return collector.topDocs(); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/KnnFloatVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/KnnFloatVectorQuery.java index d2aaf4296eda..c7d6fdb3608d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/KnnFloatVectorQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/KnnFloatVectorQuery.java @@ -47,7 +47,7 @@ public class KnnFloatVectorQuery extends AbstractKnnVectorQuery { private static final TopDocs NO_RESULTS = TopDocsCollector.EMPTY_TOPDOCS; - private final float[] target; + protected final float[] target; /** * Find the k nearest documents to the target vector according to the vectors in the diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java index 93dd1ea91e31..30b1d4b7e5a8 100644 --- 
a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java @@ -64,7 +64,7 @@ final class MaxScoreBulkScorer extends BulkScorer { allScorers[i++] = w; } this.cost = cost; - essentialQueue = new DisiPriorityQueue(allScorers.length); + essentialQueue = DisiPriorityQueue.ofMaxSize(allScorers.length); maxScoreSums = new double[allScorers.length]; } diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiCollector.java b/lucene/core/src/java/org/apache/lucene/search/MultiCollector.java index c5372f3170a4..c08f8cdee7e4 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiCollector.java @@ -25,7 +25,7 @@ /** * A {@link Collector} which allows running a search with several {@link Collector}s. It offers a * static {@link #wrap} method which accepts a list of collectors and wraps them with {@link - * MultiCollector}, while filtering out the null null ones. + * MultiCollector}, while filtering out the null ones. * *

NOTE:When mixing collectors that want to skip low-scoring hits ({@link * ScoreMode#TOP_SCORES}) with ones that require to see all hits, such as mixing {@link diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiCollectorManager.java b/lucene/core/src/java/org/apache/lucene/search/MultiCollectorManager.java index bbdfa56da156..01fa859c753d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiCollectorManager.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiCollectorManager.java @@ -22,8 +22,8 @@ import java.util.List; /** - * A {@link CollectorManager} implements which wrap a set of {@link CollectorManager} as {@link - * MultiCollector} acts for {@link Collector}. + * A composite {@link CollectorManager} which wraps a set of {@link CollectorManager} instances, + * akin to how {@link MultiCollector} wraps {@link Collector} instances. */ public class MultiCollectorManager implements CollectorManager { @@ -56,21 +56,21 @@ public Collector newCollector() throws IOException { } @Override - public Object[] reduce(Collection reducableCollectors) throws IOException { - final int size = reducableCollectors.size(); + public Object[] reduce(Collection reducibleCollectors) throws IOException { + final int size = reducibleCollectors.size(); final Object[] results = new Object[collectorManagers.length]; for (int i = 0; i < collectorManagers.length; i++) { - final List reducableCollector = new ArrayList<>(size); - for (Collector collector : reducableCollectors) { + final List reducibleCollector = new ArrayList<>(size); + for (Collector collector : reducibleCollectors) { // MultiCollector will not actually wrap the collector if only one is provided, so we // check the instance type here: if (collector instanceof MultiCollector) { - reducableCollector.add(((MultiCollector) collector).getCollectors()[i]); + reducibleCollector.add(((MultiCollector) collector).getCollectors()[i]); } else { - reducableCollector.add(collector); + reducibleCollector.add(collector); } } - results[i] = collectorManagers[i].reduce(reducableCollector); + results[i] = collectorManagers[i].reduce(reducibleCollector); } return results; } diff --git a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java index 1b6d6869c19e..97ae34713f86 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java @@ -188,9 +188,7 @@ public void visit(DocIdSetIterator iterator) throws IOException { @Override public void visit(IntsRef ref) { - for (int i = ref.offset; i < ref.offset + ref.length; i++) { - adder.add(ref.ints[i]); - } + adder.add(ref); } @Override @@ -235,7 +233,7 @@ public void visit(IntsRef ref) { for (int i = ref.offset; i < ref.offset + ref.length; i++) { result.clear(ref.ints[i]); } - cost[0] -= ref.length; + cost[0] = Math.max(0, cost[0] - ref.length); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/ReferenceManager.java b/lucene/core/src/java/org/apache/lucene/search/ReferenceManager.java index 699da549d0e3..e5a669d85e07 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ReferenceManager.java +++ b/lucene/core/src/java/org/apache/lucene/search/ReferenceManager.java @@ -101,8 +101,8 @@ public final G acquire() throws IOException { if (getRefCount(ref) == 0 && current == ref) { assert ref != null; /* if we can't increment the reader but we are - still the current reference the RM is in a - illegal 
states since we can't make any progress + still the current reference the RM is in an + illegal state since we can't make any progress anymore. The reference is closed but the RM still holds on to it as the actual instance. This can only happen if somebody outside of the RM diff --git a/lucene/core/src/java/org/apache/lucene/search/SeededKnnByteVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/SeededKnnByteVectorQuery.java new file mode 100644 index 000000000000..980b6869c34f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/SeededKnnByteVectorQuery.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.Objects; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.search.knn.KnnCollectorManager; +import org.apache.lucene.search.knn.SeededKnnCollectorManager; + +/** + * This is a version of knn byte vector query that provides a query seed to initiate the vector + * search. NOTE: The underlying format is free to ignore the provided seed + * + *

See "Lexically-Accelerated Dense + * Retrieval" (Kulkarni, Hrishikesh and MacAvaney, Sean and Goharian, Nazli and Frieder, Ophir). + * In SIGIR '23: Proceedings of the 46th International ACM SIGIR Conference on Research and + * Development in Information Retrieval Pages 152 - 162 + * + * @lucene.experimental + */ +public class SeededKnnByteVectorQuery extends KnnByteVectorQuery { + final Query seed; + final Weight seedWeight; + + /** + * Construct a new SeededKnnByteVectorQuery instance + * + * @param field knn byte vector field to query + * @param target the query vector + * @param k number of neighbors to return + * @param filter a filter on the neighbors to return + * @param seed a query seed to initiate the vector format search + */ + public SeededKnnByteVectorQuery(String field, byte[] target, int k, Query filter, Query seed) { + super(field, target, k, filter); + this.seed = Objects.requireNonNull(seed); + this.seedWeight = null; + } + + SeededKnnByteVectorQuery(String field, byte[] target, int k, Query filter, Weight seedWeight) { + super(field, target, k, filter); + this.seed = null; + this.seedWeight = Objects.requireNonNull(seedWeight); + } + + @Override + public Query rewrite(IndexSearcher indexSearcher) throws IOException { + if (seedWeight != null) { + return super.rewrite(indexSearcher); + } + BooleanQuery.Builder booleanSeedQueryBuilder = + new BooleanQuery.Builder() + .add(seed, BooleanClause.Occur.MUST) + .add(new FieldExistsQuery(field), BooleanClause.Occur.FILTER); + if (filter != null) { + booleanSeedQueryBuilder.add(filter, BooleanClause.Occur.FILTER); + } + Query seedRewritten = indexSearcher.rewrite(booleanSeedQueryBuilder.build()); + Weight seedWeight = indexSearcher.createWeight(seedRewritten, ScoreMode.TOP_SCORES, 1f); + SeededKnnByteVectorQuery rewritten = + new SeededKnnByteVectorQuery(field, target, k, filter, seedWeight); + return rewritten.rewrite(indexSearcher); + } + + @Override + protected KnnCollectorManager getKnnCollectorManager(int k, IndexSearcher searcher) { + if (seedWeight == null) { + throw new UnsupportedOperationException("must be rewritten before constructing manager"); + } + return new SeededKnnCollectorManager( + super.getKnnCollectorManager(k, searcher), + seedWeight, + k, + leaf -> { + ByteVectorValues vv = leaf.getByteVectorValues(field); + if (vv == null) { + ByteVectorValues.checkField(leaf.getContext().reader(), field); + } + return vv; + }); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/SeededKnnFloatVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/SeededKnnFloatVectorQuery.java new file mode 100644 index 000000000000..02a33bdcdef7 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/SeededKnnFloatVectorQuery.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.Objects; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.search.knn.KnnCollectorManager; +import org.apache.lucene.search.knn.SeededKnnCollectorManager; + +/** + * This is a version of knn float vector query that provides a query seed to initiate the vector + * search. NOTE: The underlying format is free to ignore the provided seed. + * + *

See "Lexically-Accelerated Dense + * Retrieval" (Kulkarni, Hrishikesh and MacAvaney, Sean and Goharian, Nazli and Frieder, Ophir). + * In SIGIR '23: Proceedings of the 46th International ACM SIGIR Conference on Research and + * Development in Information Retrieval Pages 152 - 162 + * + * @lucene.experimental + */ +public class SeededKnnFloatVectorQuery extends KnnFloatVectorQuery { + final Query seed; + final Weight seedWeight; + + /** + * Construct a new SeededKnnFloatVectorQuery instance + * + * @param field knn float vector field to query + * @param target the query vector + * @param k number of neighbors to return + * @param filter a filter on the neighbors to return + * @param seed a query seed to initiate the vector format search + */ + public SeededKnnFloatVectorQuery(String field, float[] target, int k, Query filter, Query seed) { + super(field, target, k, filter); + this.seed = Objects.requireNonNull(seed); + this.seedWeight = null; + } + + SeededKnnFloatVectorQuery(String field, float[] target, int k, Query filter, Weight seedWeight) { + super(field, target, k, filter); + this.seed = null; + this.seedWeight = Objects.requireNonNull(seedWeight); + } + + @Override + public Query rewrite(IndexSearcher indexSearcher) throws IOException { + if (seedWeight != null) { + return super.rewrite(indexSearcher); + } + BooleanQuery.Builder booleanSeedQueryBuilder = + new BooleanQuery.Builder() + .add(seed, BooleanClause.Occur.MUST) + .add(new FieldExistsQuery(field), BooleanClause.Occur.FILTER); + if (filter != null) { + booleanSeedQueryBuilder.add(filter, BooleanClause.Occur.FILTER); + } + Query seedRewritten = indexSearcher.rewrite(booleanSeedQueryBuilder.build()); + Weight seedWeight = indexSearcher.createWeight(seedRewritten, ScoreMode.TOP_SCORES, 1f); + SeededKnnFloatVectorQuery rewritten = + new SeededKnnFloatVectorQuery(field, target, k, filter, seedWeight); + return rewritten.rewrite(indexSearcher); + } + + @Override + protected KnnCollectorManager getKnnCollectorManager(int k, IndexSearcher searcher) { + if (seedWeight == null) { + throw new UnsupportedOperationException("must be rewritten before constructing manager"); + } + return new SeededKnnCollectorManager( + super.getKnnCollectorManager(k, searcher), + seedWeight, + k, + leaf -> { + FloatVectorValues vv = leaf.getFloatVectorValues(field); + if (vv == null) { + FloatVectorValues.checkField(leaf.getContext().reader(), field); + } + return vv; + }); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/TimeLimitingKnnCollectorManager.java b/lucene/core/src/java/org/apache/lucene/search/TimeLimitingKnnCollectorManager.java index 2a1f312fbc58..2dc2f035b90f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TimeLimitingKnnCollectorManager.java +++ b/lucene/core/src/java/org/apache/lucene/search/TimeLimitingKnnCollectorManager.java @@ -45,51 +45,19 @@ public KnnCollector newCollector(int visitedLimit, LeafReaderContext context) th return new TimeLimitingKnnCollector(collector); } - class TimeLimitingKnnCollector implements KnnCollector { - private final KnnCollector collector; - - TimeLimitingKnnCollector(KnnCollector collector) { - this.collector = collector; + class TimeLimitingKnnCollector extends KnnCollector.Decorator { + public TimeLimitingKnnCollector(KnnCollector collector) { + super(collector); } @Override public boolean earlyTerminated() { - return queryTimeout.shouldExit() || collector.earlyTerminated(); - } - - @Override - public void incVisitedCount(int count) { - collector.incVisitedCount(count); - } 
- - @Override - public long visitedCount() { - return collector.visitedCount(); - } - - @Override - public long visitLimit() { - return collector.visitLimit(); - } - - @Override - public int k() { - return collector.k(); - } - - @Override - public boolean collect(int docId, float similarity) { - return collector.collect(docId, similarity); - } - - @Override - public float minCompetitiveSimilarity() { - return collector.minCompetitiveSimilarity(); + return queryTimeout.shouldExit() || super.earlyTerminated(); } @Override public TopDocs topDocs() { - TopDocs docs = collector.topDocs(); + TopDocs docs = super.topDocs(); // Mark results as partial if timeout is met TotalHits.Relation relation = diff --git a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java index 897713dbe17d..88ffa4a0c62e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java @@ -16,9 +16,9 @@ */ package org.apache.lucene.search; -import static org.apache.lucene.search.DisiPriorityQueue.leftNode; -import static org.apache.lucene.search.DisiPriorityQueue.parentNode; -import static org.apache.lucene.search.DisiPriorityQueue.rightNode; +import static org.apache.lucene.search.DisiPriorityQueueN.leftNode; +import static org.apache.lucene.search.DisiPriorityQueueN.parentNode; +import static org.apache.lucene.search.DisiPriorityQueueN.rightNode; import static org.apache.lucene.search.ScorerUtil.costWithMinShouldMatch; import java.io.IOException; @@ -170,7 +170,7 @@ private static long scaleMinScore(float minScore, int scalingFactor) { this.scoreMode = scoreMode; - head = new DisiPriorityQueue(scorers.size()); + head = DisiPriorityQueue.ofMaxSize(scorers.size()); // there can be at most num_scorers - 1 scorers beyond the current position tail = new DisiWrapper[scorers.size()]; diff --git a/lucene/core/src/java/org/apache/lucene/search/knn/EntryPointProvider.java b/lucene/core/src/java/org/apache/lucene/search/knn/EntryPointProvider.java new file mode 100644 index 000000000000..9e7b44b571df --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/knn/EntryPointProvider.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.knn; + +import org.apache.lucene.search.DocIdSetIterator; + +/** Provides entry points for the kNN search */ +public interface EntryPointProvider { + /** Iterator of valid entry points for the kNN search */ + DocIdSetIterator entryPoints(); + + /** Number of valid entry points for the kNN search */ + int numberOfEntryPoints(); +} diff --git a/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java b/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java index 051cd9ed6339..6b5e398d7087 100644 --- a/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java @@ -19,7 +19,6 @@ import org.apache.lucene.search.AbstractKnnCollector; import org.apache.lucene.search.KnnCollector; -import org.apache.lucene.search.TopDocs; import org.apache.lucene.util.hnsw.BlockingFloatHeap; import org.apache.lucene.util.hnsw.FloatHeap; @@ -29,7 +28,7 @@ * * @lucene.experimental */ -public final class MultiLeafKnnCollector implements KnnCollector { +public final class MultiLeafKnnCollector extends KnnCollector.Decorator { // greediness of globally non-competitive search: (0,1] private static final float DEFAULT_GREEDINESS = 0.9f; @@ -77,6 +76,7 @@ public MultiLeafKnnCollector( int interval, BlockingFloatHeap globalSimilarityQueue, AbstractKnnCollector subCollector) { + super(subCollector); if (greediness < 0 || greediness > 1) { throw new IllegalArgumentException("greediness must be in [0,1]"); } @@ -91,31 +91,6 @@ public MultiLeafKnnCollector( this.updatesScratch = new float[k]; } - @Override - public boolean earlyTerminated() { - return subCollector.earlyTerminated(); - } - - @Override - public void incVisitedCount(int count) { - subCollector.incVisitedCount(count); - } - - @Override - public long visitedCount() { - return subCollector.visitedCount(); - } - - @Override - public long visitLimit() { - return subCollector.visitLimit(); - } - - @Override - public int k() { - return subCollector.k(); - } - @Override public boolean collect(int docId, float similarity) { boolean localSimUpdated = subCollector.collect(docId, similarity); @@ -157,11 +132,6 @@ public float minCompetitiveSimilarity() { Math.min(nonCompetitiveQueue.peek(), cachedGlobalMinSim)); } - @Override - public TopDocs topDocs() { - return subCollector.topDocs(); - } - @Override public String toString() { return "MultiLeafKnnCollector[subCollector=" + subCollector + "]"; diff --git a/lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollector.java b/lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollector.java new file mode 100644 index 000000000000..c3c4f62901ee --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollector.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.knn; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.KnnCollector; + +/** + * A {@link KnnCollector} that provides seeded knn collection. See usage in {@link + * SeededKnnCollectorManager}. + * + * @lucene.experimental + */ +class SeededKnnCollector extends KnnCollector.Decorator implements EntryPointProvider { + private final DocIdSetIterator entryPoints; + private final int numberOfEntryPoints; + + SeededKnnCollector( + KnnCollector collector, DocIdSetIterator entryPoints, int numberOfEntryPoints) { + super(collector); + this.entryPoints = entryPoints; + this.numberOfEntryPoints = numberOfEntryPoints; + } + + @Override + public DocIdSetIterator entryPoints() { + return entryPoints; + } + + @Override + public int numberOfEntryPoints() { + return numberOfEntryPoints; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollectorManager.java b/lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollectorManager.java new file mode 100644 index 000000000000..7631db6e3022 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollectorManager.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.knn; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.index.KnnVectorValues; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.BulkScorer; +import org.apache.lucene.search.CollectionTerminatedException; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.LeafCollector; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.TopScoreDocCollectorManager; +import org.apache.lucene.search.Weight; +import org.apache.lucene.util.IOFunction; + +/** + * A {@link KnnCollectorManager} that provides seeded knn collection. See usage in {@link + * org.apache.lucene.search.SeededKnnFloatVectorQuery} and {@link + * org.apache.lucene.search.SeededKnnByteVectorQuery}. 
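For a usage illustration of the seeded queries referenced above, here is a minimal sketch. The field names "body" and "vector", the seed term, and k=10 are hypothetical and not taken from this patch.

import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SeededKnnFloatVectorQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;

class SeededKnnSketch {
  static TopDocs search(IndexSearcher searcher, float[] target) throws IOException {
    // The seed is an ordinary lexical query; its top hits become entry points for
    // the vector search. The underlying vector format is free to ignore the seed.
    Query seed = new TermQuery(new Term("body", "lucene"));
    Query knn = new SeededKnnFloatVectorQuery("vector", target, 10, null, seed);
    return searcher.search(knn, 10);
  }
}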
+ */ +public class SeededKnnCollectorManager implements KnnCollectorManager { + private final KnnCollectorManager delegate; + private final Weight seedWeight; + private final int k; + private final IOFunction vectorValuesSupplier; + + public SeededKnnCollectorManager( + KnnCollectorManager delegate, + Weight seedWeight, + int k, + IOFunction vectorValuesSupplier) { + this.delegate = delegate; + this.seedWeight = seedWeight; + this.k = k; + this.vectorValuesSupplier = vectorValuesSupplier; + } + + @Override + public KnnCollector newCollector(int visitedLimit, LeafReaderContext ctx) throws IOException { + // Execute the seed query + TopScoreDocCollector seedCollector = + new TopScoreDocCollectorManager(k, null, Integer.MAX_VALUE).newCollector(); + final LeafReader leafReader = ctx.reader(); + final LeafCollector leafCollector = seedCollector.getLeafCollector(ctx); + if (leafCollector != null) { + try { + BulkScorer scorer = seedWeight.bulkScorer(ctx); + if (scorer != null) { + scorer.score( + leafCollector, + leafReader.getLiveDocs(), + 0 /* min */, + DocIdSetIterator.NO_MORE_DOCS /* max */); + } + } catch ( + @SuppressWarnings("unused") + CollectionTerminatedException e) { + } + leafCollector.finish(); + } + + TopDocs seedTopDocs = seedCollector.topDocs(); + KnnVectorValues vectorValues = vectorValuesSupplier.apply(leafReader); + final KnnCollector delegateCollector = delegate.newCollector(visitedLimit, ctx); + if (seedTopDocs.totalHits.value() == 0 || vectorValues == null) { + return delegateCollector; + } + KnnVectorValues.DocIndexIterator indexIterator = vectorValues.iterator(); + DocIdSetIterator seedDocs = new MappedDISI(indexIterator, new TopDocsDISI(seedTopDocs)); + return new SeededKnnCollector(delegateCollector, seedDocs, seedTopDocs.scoreDocs.length); + } + + private static class MappedDISI extends DocIdSetIterator { + KnnVectorValues.DocIndexIterator indexedDISI; + DocIdSetIterator sourceDISI; + + private MappedDISI(KnnVectorValues.DocIndexIterator indexedDISI, DocIdSetIterator sourceDISI) { + this.indexedDISI = indexedDISI; + this.sourceDISI = sourceDISI; + } + + /** + * Advances the source iterator to the first document number that is greater than or equal to + * the provided target and returns the corresponding index. + */ + @Override + public int advance(int target) throws IOException { + int newTarget = sourceDISI.advance(target); + if (newTarget != NO_MORE_DOCS) { + indexedDISI.advance(newTarget); + } + return docID(); + } + + @Override + public long cost() { + return sourceDISI.cost(); + } + + @Override + public int docID() { + if (indexedDISI.docID() == NO_MORE_DOCS || sourceDISI.docID() == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } + return indexedDISI.index(); + } + + /** Advances to the next document in the source iterator and returns the corresponding index. 
*/ + @Override + public int nextDoc() throws IOException { + int newTarget = sourceDISI.nextDoc(); + if (newTarget != NO_MORE_DOCS) { + indexedDISI.advance(newTarget); + } + return docID(); + } + } + + private static class TopDocsDISI extends DocIdSetIterator { + private final int[] sortedDocIds; + private int idx = -1; + + private TopDocsDISI(TopDocs topDocs) { + sortedDocIds = new int[topDocs.scoreDocs.length]; + for (int i = 0; i < topDocs.scoreDocs.length; i++) { + sortedDocIds[i] = topDocs.scoreDocs[i].doc; + } + Arrays.sort(sortedDocIds); + } + + @Override + public int advance(int target) throws IOException { + return slowAdvance(target); + } + + @Override + public long cost() { + return sortedDocIds.length; + } + + @Override + public int docID() { + if (idx == -1) { + return -1; + } else if (idx >= sortedDocIds.length) { + return DocIdSetIterator.NO_MORE_DOCS; + } else { + return sortedDocIds[idx]; + } + } + + @Override + public int nextDoc() { + idx += 1; + return docID(); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java index 1738259fa2fb..cd47500a2df7 100644 --- a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java @@ -401,7 +401,7 @@ private static final class SlicedIndexInput extends BufferedIndexInput { ? base.toString() : (base.toString() + " [slice=" + sliceDescription + "]"), BufferedIndexInput.BUFFER_SIZE); - if (offset < 0 || length < 0 || offset + length > base.length()) { + if ((length | offset) < 0 || length > base.length() - offset) { throw new IllegalArgumentException( "slice() " + sliceDescription + " out of bounds: " + base); } diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java index 39e920616209..dee5c8e3a738 100644 --- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java @@ -424,7 +424,7 @@ public void skipBytes(long numBytes) throws IOException { } public ByteBuffersDataInput slice(long offset, long length) { - if (offset < 0 || length < 0 || offset + length > this.length) { + if ((length | offset) < 0 || length > this.length - offset) { throw new IllegalArgumentException( String.format( Locale.ROOT, diff --git a/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java b/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java index 413e22c45ae8..5039c779097a 100644 --- a/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java +++ b/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java @@ -60,7 +60,7 @@ * post. *

  • {@link NIOFSDirectory} uses java.nio's FileChannel's positional io when reading to avoid * synchronization when reading from the same file. Unfortunately, due to a Windows-only Sun JRE bug this is a + * href="https://bugs.java.com/bugdatabase/view_bug?bug_id=6265734">Sun JRE bug this is a * poor choice for Windows, but on all other platforms this is the preferred choice. * Applications using {@link Thread#interrupt()} or {@link Future#cancel(boolean)} should use * {@code RAFDirectory} instead, which is provided in the {@code misc} module. See {@link @@ -349,7 +349,7 @@ private void privateDeleteFile(String name, boolean isPendingDelete) throws IOEx // a WindowsFSDirectory ... // LUCENE-6684: we suppress this check for Windows, since a file could be in a confusing // "pending delete" state, failing the first - // delete attempt with access denied and then apparently falsely failing here when we try ot + // delete attempt with access denied and then apparently falsely failing here when we try to // delete it again, with NSFE/FNFE } else { throw e; diff --git a/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java index 9e60a51790f9..933701b3c3de 100644 --- a/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java @@ -21,7 +21,7 @@ import org.apache.lucene.internal.tests.TestSecrets; /** - * IndexInput implementation that delegates calls to another directory. This class can be used to + * IndexInput implementation that delegates calls to another IndexInput. This class can be used to * add limitations on top of an existing {@link IndexInput} implementation or to add additional * sanity checks for tests. However, if you plan to write your own {@link IndexInput} * implementation, you should consider extending directly {@link IndexInput} or {@link DataInput} diff --git a/lucene/core/src/java/org/apache/lucene/store/IOContext.java b/lucene/core/src/java/org/apache/lucene/store/IOContext.java index 91f3822dbc13..5f341609748f 100644 --- a/lucene/core/src/java/org/apache/lucene/store/IOContext.java +++ b/lucene/core/src/java/org/apache/lucene/store/IOContext.java @@ -34,9 +34,7 @@ public record IOContext( Context context, MergeInfo mergeInfo, FlushInfo flushInfo, ReadAdvice readAdvice) { - /** - * Context is a enumerator which specifies the context in which the Directory is being used for. - */ + /** Context is an enumerator which specifies the context in which the Directory is being used. */ public enum Context { /** Context for reads and writes that are associated with a merge. */ MERGE, diff --git a/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java b/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java index 246f48082cfe..b05652789cfa 100644 --- a/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java +++ b/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java @@ -36,7 +36,7 @@ *

    NOTE: NIOFSDirectory is not recommended on Windows because of a bug in how * FileChannel.read is implemented in Sun's JRE. Inside of the implementation the position is * apparently synchronized. See here for details. + * href="https://bugs.java.com/bugdatabase/view_bug?bug_id=6265734">here for details. * *

    NOTE: Accessing this class either directly or indirectly from a thread while it's * interrupted can close the underlying file descriptor immediately if at the same time the thread @@ -139,7 +139,7 @@ public NIOFSIndexInput clone() { @Override public IndexInput slice(String sliceDescription, long offset, long length) throws IOException { - if (offset < 0 || length < 0 || offset + length > this.length()) { + if ((length | offset) < 0 || length > this.length() - offset) { throw new IllegalArgumentException( "slice() " + sliceDescription diff --git a/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java b/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java index 16d6aa22ce53..35a300c1763a 100644 --- a/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java +++ b/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java @@ -56,8 +56,8 @@ * * *

    This will cache all newly flushed segments, all merges whose expected segment size is {@code - * <= 5 MB}, unless the net cached bytes exceeds 60 MB at which point all writes will not be cached - * (until the net bytes falls below 60 MB). + * <= 5 MB}, unless the net cached bytes exceed 60 MB at which point all writes will not be cached + * (until the net bytes fall below 60 MB). * * @lucene.experimental */ diff --git a/lucene/core/src/java/org/apache/lucene/store/RateLimitedIndexOutput.java b/lucene/core/src/java/org/apache/lucene/store/RateLimitedIndexOutput.java index bfac505b8b09..86c7ca3885f3 100644 --- a/lucene/core/src/java/org/apache/lucene/store/RateLimitedIndexOutput.java +++ b/lucene/core/src/java/org/apache/lucene/store/RateLimitedIndexOutput.java @@ -31,7 +31,7 @@ public final class RateLimitedIndexOutput extends FilterIndexOutput { private long bytesSinceLastPause; /** - * Cached here not not always have to call RateLimiter#getMinPauseCheckBytes() which does volatile + * Cached here to not always have to call RateLimiter#getMinPauseCheckBytes() which does volatile * read. */ private long currentMinPauseCheckBytes; diff --git a/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java b/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java index 4d7c83057cbe..ba55573baf0e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java @@ -99,20 +99,16 @@ public long cost() { } @Override - public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset) - throws IOException { - // TODO: Can we also optimize the case when acceptDocs is not null? - if (acceptDocs == null - && offset < bits.length() - && bits instanceof FixedBitSet fixedBits - // no bits are set between `offset` and `doc` - && fixedBits.nextSetBit(offset) == doc - // the whole `bitSet` is getting filled - && (upTo - offset == bitSet.length())) { - bitSet.orRange(fixedBits, offset); - advance(upTo); // set the current doc - } else { - super.intoBitSet(acceptDocs, upTo, bitSet, offset); + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { + if (upTo > doc && bits instanceof FixedBitSet fixedBits) { + int actualUpto = Math.min(upTo, length); + // The destination bit set may be shorter than this bit set. This is only legal if all bits + // beyond offset + bitSet.length() are clear. If not, the below call to `super.intoBitSet` + // will throw an exception. + actualUpto = (int) Math.min(actualUpto, offset + (long) bitSet.length()); + FixedBitSet.orRange(fixedBits, doc, bitSet, doc - offset, actualUpto - doc); + advance(actualUpto); // set the current doc } + super.intoBitSet(upTo, bitSet, offset); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/Bits.java b/lucene/core/src/java/org/apache/lucene/util/Bits.java index dd42ad4b1973..61757a1a34e4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/Bits.java +++ b/lucene/core/src/java/org/apache/lucene/util/Bits.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.util; +import org.apache.lucene.search.DocIdSetIterator; + /** * Interface for Bitset-like structures. * @@ -34,6 +36,32 @@ public interface Bits { /** Returns the number of bits in this set */ int length(); + /** + * Apply this {@code Bits} instance to the given {@link FixedBitSet}, which starts at the given + * {@code offset}. + * + *

    This should behave the same way as the default implementation, which does the following: + * + *

    +   * for (int i = bitSet.nextSetBit(0);
    +   *     i != DocIdSetIterator.NO_MORE_DOCS;
    +   *     i = i + 1 >= bitSet.length() ? DocIdSetIterator.NO_MORE_DOCS : bitSet.nextSetBit(i + 1)) {
    +   *   if (get(offset + i) == false) {
    +   *     bitSet.clear(i);
    +   *   }
    +   * }
    +   * 
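To make the masking behavior shown above concrete, here is a small sketch; the live-docs layout, offset, and doc IDs are invented for illustration. It pairs naturally with DocIdSetIterator#intoBitSet, which no longer applies acceptDocs itself.

import org.apache.lucene.util.FixedBitSet;

class ApplyMaskSketch {
  static void demo() {
    // Hypothetical live docs: docs 128..255 are live, except doc 130.
    FixedBitSet liveDocs = new FixedBitSet(256);
    liveDocs.set(128, 256);
    liveDocs.clear(130);

    // A window of collected matches, indexed relative to offset 128.
    FixedBitSet window = new FixedBitSet(64);
    window.set(2);  // doc 130
    window.set(7);  // doc 135

    liveDocs.applyMask(window, 128);
    // Bit 2 is cleared because doc 130 is not live; bit 7 (doc 135) remains set.
  }
}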
    + */ + default void applyMask(FixedBitSet bitSet, int offset) { + for (int i = bitSet.nextSetBit(0); + i != DocIdSetIterator.NO_MORE_DOCS; + i = i + 1 >= bitSet.length() ? DocIdSetIterator.NO_MORE_DOCS : bitSet.nextSetBit(i + 1)) { + if (get(offset + i) == false) { + bitSet.clear(i); + } + } + } + Bits[] EMPTY_ARRAY = new Bits[0]; /** Bits impl of the specified length with all bits set. */ diff --git a/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java b/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java index 841149f4febe..cf9bf5432b46 100644 --- a/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.util; +import java.io.IOException; import org.apache.lucene.search.DocIdSetIterator; /** @@ -89,4 +90,18 @@ public int advance(int target) { public long cost() { return cost; } + + @Override + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { + int actualUpto = Math.min(upTo, length); + // The destination bit set may be shorter than this bit set. This is only legal if all bits + // beyond offset + bitSet.length() are clear. If not, the below call to `super.intoBitSet` will + // throw an exception. + actualUpto = (int) Math.min(actualUpto, offset + (long) bitSet.length()); + if (actualUpto > doc) { + FixedBitSet.orRange(bits, doc - docBase, bitSet, doc - offset, actualUpto - doc); + advance(actualUpto); // set the current doc + } + super.intoBitSet(upTo, bitSet, offset); + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java index 28128af05f67..159cef025678 100644 --- a/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java @@ -41,29 +41,28 @@ public final class DocIdSetBuilder { * * @see DocIdSetBuilder#grow */ - public abstract static class BulkAdder { - public abstract void add(int doc); + public sealed interface BulkAdder permits FixedBitSetAdder, BufferAdder { + void add(int doc); - public void add(DocIdSetIterator iterator) throws IOException { - int docID; - while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - add(docID); - } - } - } + void add(IntsRef docs); - private static class FixedBitSetAdder extends BulkAdder { - final FixedBitSet bitSet; + void add(DocIdSetIterator iterator) throws IOException; + } - FixedBitSetAdder(FixedBitSet bitSet) { - this.bitSet = bitSet; - } + private record FixedBitSetAdder(FixedBitSet bitSet) implements BulkAdder { @Override public void add(int doc) { bitSet.set(doc); } + @Override + public void add(IntsRef docs) { + for (int i = 0; i < docs.length; i++) { + bitSet.set(docs.ints[docs.offset + i]); + } + } + @Override public void add(DocIdSetIterator iterator) throws IOException { bitSet.or(iterator); @@ -85,17 +84,26 @@ private static class Buffer { } } - private static class BufferAdder extends BulkAdder { - final Buffer buffer; - - BufferAdder(Buffer buffer) { - this.buffer = buffer; - } + private record BufferAdder(Buffer buffer) implements BulkAdder { @Override public void add(int doc) { buffer.array[buffer.length++] = doc; } + + @Override + public void add(IntsRef docs) { + System.arraycopy(docs.ints, docs.offset, buffer.array, buffer.length, docs.length); + buffer.length += docs.length; + } + + @Override + public void 
add(DocIdSetIterator iterator) throws IOException { + int docID; + while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + add(docID); + } + } } private final int maxDoc; diff --git a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java index 584f30b3baac..9867582dd522 100644 --- a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.util.Arrays; +import java.util.Objects; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; @@ -338,66 +339,155 @@ public int prevSetBit(int index) { @Override public void or(DocIdSetIterator iter) throws IOException { - if (iter instanceof DocBaseBitSetIterator) { - // TODO: implement DocBaseBitSetIterator#intoBitSet instead - checkUnpositioned(iter); - DocBaseBitSetIterator baseIter = (DocBaseBitSetIterator) iter; - or(baseIter.getDocBase() >> 6, baseIter.getBitSet()); - } else { - checkUnpositioned(iter); - iter.nextDoc(); - iter.intoBitSet(null, DocIdSetIterator.NO_MORE_DOCS, this, 0); - } + checkUnpositioned(iter); + iter.nextDoc(); + iter.intoBitSet(DocIdSetIterator.NO_MORE_DOCS, this, 0); } - private void or(final int otherOffsetWords, FixedBitSet other) { - or(otherOffsetWords, other.bits, other.numWords); + /** Read {@code numBits} (between 1 and 63) bits from {@code bitSet} at {@code from}. */ + private static long readNBits(long[] bitSet, int from, int numBits) { + assert numBits > 0 && numBits < Long.SIZE; + long bits = bitSet[from >> 6] >>> from; + int numBitsSoFar = Long.SIZE - (from & 0x3F); + if (numBitsSoFar < numBits) { + bits |= bitSet[(from >> 6) + 1] << -from; + } + return bits & ((1L << numBits) - 1); } - private void or(final int otherOffsetWords, final long[] otherArr, final int otherNumWords) { - assert otherNumWords + otherOffsetWords <= numWords - : "numWords=" + numWords + ", otherNumWords=" + otherNumWords; - int pos = Math.min(numWords - otherOffsetWords, otherNumWords); - final long[] thisArr = this.bits; - while (--pos >= 0) { - thisArr[pos + otherOffsetWords] |= otherArr[pos]; + /** + * Or {@code length} bits starting at {@code sourceFrom} from {@code source} into {@code dest} + * starting at {@code destFrom}. + */ + public static void orRange( + FixedBitSet source, int sourceFrom, FixedBitSet dest, int destFrom, int length) { + assert length >= 0; + Objects.checkFromIndexSize(sourceFrom, length, source.length()); + Objects.checkFromIndexSize(destFrom, length, dest.length()); + + if (length == 0) { + return; + } + + long[] sourceBits = source.getBits(); + long[] destBits = dest.getBits(); + + // First, align `destFrom` with a word start, ie. 
a multiple of Long.SIZE (64) + if ((destFrom & 0x3F) != 0) { + int numBitsNeeded = Math.min(-destFrom & 0x3F, length); + long bits = readNBits(sourceBits, sourceFrom, numBitsNeeded) << destFrom; + destBits[destFrom >> 6] |= bits; + + sourceFrom += numBitsNeeded; + destFrom += numBitsNeeded; + length -= numBitsNeeded; + } + + if (length == 0) { + return; + } + + assert (destFrom & 0x3F) == 0; + + // Now OR at the word level + int numFullWords = length >> 6; + int sourceWordFrom = sourceFrom >> 6; + int destWordFrom = destFrom >> 6; + + // Note: these two for loops auto-vectorize + if ((sourceFrom & 0x3F) == 0) { + // sourceFrom and destFrom are both aligned with a long[] + for (int i = 0; i < numFullWords; ++i) { + destBits[destWordFrom + i] |= sourceBits[sourceWordFrom + i]; + } + } else { + for (int i = 0; i < numFullWords; ++i) { + destBits[destWordFrom + i] |= + (sourceBits[sourceWordFrom + i] >>> sourceFrom) + | (sourceBits[sourceWordFrom + i + 1] << -sourceFrom); + } + } + + sourceFrom += numFullWords << 6; + destFrom += numFullWords << 6; + length -= numFullWords << 6; + + // Finally handle tail bits + if (length > 0) { + long bits = readNBits(sourceBits, sourceFrom, length); + destBits[destFrom >> 6] |= bits; } } /** - * Or {@code min(length(), other.length() - from} bits starting at {@code from} from {@code other} - * into this bit set starting at 0. + * And {@code length} bits starting at {@code sourceFrom} from {@code source} into {@code dest} + * starting at {@code destFrom}. */ - void orRange(FixedBitSet other, int from) { - int numBits = Math.min(length(), other.length() - from); - if (numBits <= 0) { + public static void andRange( + FixedBitSet source, int sourceFrom, FixedBitSet dest, int destFrom, int length) { + assert length >= 0 : length; + Objects.checkFromIndexSize(sourceFrom, length, source.length()); + Objects.checkFromIndexSize(destFrom, length, dest.length()); + + if (length == 0) { return; } - int numFullWords = numBits >> 6; - long[] otherBits = other.getBits(); - int wordOffset = from >> 6; - if ((from & 0x3F) == 0) { - // from is aligned with a long[] + + long[] sourceBits = source.getBits(); + long[] destBits = dest.getBits(); + + // First, align `destFrom` with a word start, ie. 
a multiple of Long.SIZE (64) + if ((destFrom & 0x3F) != 0) { + int numBitsNeeded = Math.min(-destFrom & 0x3F, length); + long bits = readNBits(sourceBits, sourceFrom, numBitsNeeded) << destFrom; + bits |= ~(((1L << numBitsNeeded) - 1) << destFrom); + destBits[destFrom >> 6] &= bits; + + sourceFrom += numBitsNeeded; + destFrom += numBitsNeeded; + length -= numBitsNeeded; + } + + if (length == 0) { + return; + } + + assert (destFrom & 0x3F) == 0; + + // Now AND at the word level + int numFullWords = length >> 6; + int sourceWordFrom = sourceFrom >> 6; + int destWordFrom = destFrom >> 6; + + // Note: these two for loops auto-vectorize + if ((sourceFrom & 0x3F) == 0) { + // sourceFrom and destFrom are both aligned with a long[] for (int i = 0; i < numFullWords; ++i) { - bits[i] |= otherBits[wordOffset + i]; + destBits[destWordFrom + i] &= sourceBits[sourceWordFrom + i]; } } else { for (int i = 0; i < numFullWords; ++i) { - bits[i] |= (otherBits[wordOffset + i] >>> from) | (otherBits[wordOffset + i + 1] << -from); + destBits[destWordFrom + i] &= + (sourceBits[sourceWordFrom + i] >>> sourceFrom) + | (sourceBits[sourceWordFrom + i + 1] << -sourceFrom); } } - // Handle the remainder - for (int i = numFullWords << 6; i < numBits; ++i) { - if (other.get(from + i)) { - set(i); - } + sourceFrom += numFullWords << 6; + destFrom += numFullWords << 6; + length -= numFullWords << 6; + + // Finally handle tail bits + if (length > 0) { + long bits = readNBits(sourceBits, sourceFrom, length); + bits |= (~0L << length); + destBits[destFrom >> 6] &= bits; } } /** this = this OR other */ public void or(FixedBitSet other) { - orRange(other, 0); + orRange(other, 0, this, 0, other.length()); } /** this = this XOR other */ @@ -687,4 +777,18 @@ public static FixedBitSet copyOf(Bits bits) { public Bits asReadOnlyBits() { return new FixedBits(bits, numBits); } + + @Override + public void applyMask(FixedBitSet bitSet, int offset) { + // Note: Some scorers don't track maxDoc and may thus call this method with an offset that is + // beyond bitSet.length() + int length = Math.min(bitSet.length(), length() - offset); + if (length >= 0) { + andRange(this, offset, bitSet, 0, length); + } + if (length < bitSet.length() + && bitSet.nextSetBit(Math.max(0, length)) != DocIdSetIterator.NO_MORE_DOCS) { + throw new IllegalArgumentException("Some bits are set beyond the end of live docs"); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/IntArrayDocIdSet.java b/lucene/core/src/java/org/apache/lucene/util/IntArrayDocIdSet.java index d44cc7839233..4f764b37dfd9 100644 --- a/lucene/core/src/java/org/apache/lucene/util/IntArrayDocIdSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/IntArrayDocIdSet.java @@ -95,6 +95,21 @@ public int advance(int target) throws IOException { return doc = docs[i++]; } + @Override + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { + if (doc >= upTo) { + return; + } + + int from = i - 1; + int to = VectorUtil.findNextGEQ(docs, upTo, from, length); + for (int i = from; i < to; ++i) { + bitSet.set(docs[i] - offset); + } + doc = docs[to]; + i = to + 1; + } + @Override public long cost() { return length; diff --git a/lucene/core/src/java/org/apache/lucene/util/RoaringDocIdSet.java b/lucene/core/src/java/org/apache/lucene/util/RoaringDocIdSet.java index ccd92a74250e..77038dd07eda 100644 --- a/lucene/core/src/java/org/apache/lucene/util/RoaringDocIdSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/RoaringDocIdSet.java @@ -217,6 +217,20 @@ 
public int advance(int target) throws IOException { return doc = docId(i); } } + + @Override + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { + if (doc >= upTo) { + return; + } + + int from = i; + advance(upTo); + int to = i; + for (int i = from; i < to; ++i) { + bitSet.set(docId(i) - offset); + } + } }; } } @@ -312,6 +326,26 @@ private int firstDocFromNextBlock() throws IOException { } } + @Override + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { + for (; ; ) { + int subUpto = upTo - (block << 16); + if (subUpto < 0) { + break; + } + int subOffset = offset - (block << 16); + sub.intoBitSet(subUpto, bitSet, subOffset); + if (sub.docID() == NO_MORE_DOCS) { + if (firstDocFromNextBlock() == NO_MORE_DOCS) { + break; + } + } else { + doc = (block << 16) | sub.docID(); + break; + } + } + } + @Override public long cost() { return cardinality; diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java index 0efcc2ef4650..a90c79a8c808 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java @@ -24,6 +24,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.MathUtil; /** @@ -146,6 +147,19 @@ public void visit(int docID) { count[0]++; } + @Override + public void visit(DocIdSetIterator iterator) throws IOException { + int docID; + while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + visit(docID); + } + } + + @Override + public void visit(IntsRef ref) { + count[0] += ref.length; + } + @Override public void visit(int docID, byte[] packedValue) { throw new AssertionError(); diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java index d2e81addc5d4..d9d58c829d3d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java @@ -90,9 +90,7 @@ public OnHeapHnswGraph build(int maxOrd) throws IOException { }); } taskExecutor.invokeAll(futures); - finish(); - frozen = true; - return workers[0].getCompletedGraph(); + return getCompletedGraph(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java index 46d6c93d52c3..e8f0d316fd81 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java @@ -20,8 +20,10 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.IOException; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.TopKnnCollector; +import org.apache.lucene.search.knn.EntryPointProvider; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; @@ -52,7 +54,9 @@ public HnswGraphSearcher(NeighborQueue candidates, BitSet visited) { } /** - * Searches HNSW graph for the nearest neighbors of a query vector. + * Searches the HNSW graph for the nearest neighbors of a query vector. 
If entry points are + * directly provided via the knnCollector, then the search will be initialized at those points. + * Otherwise, the search will discover the best entry point per the normal HNSW search algorithm. * * @param scorer the scorer to compare the query with the nodes * @param knnCollector a collector of top knn results to be returned @@ -67,7 +71,30 @@ public static void search( HnswGraphSearcher graphSearcher = new HnswGraphSearcher( new NeighborQueue(knnCollector.k(), true), new SparseFixedBitSet(getGraphSize(graph))); - search(scorer, knnCollector, graph, graphSearcher, acceptOrds); + final int[] entryPoints; + if (knnCollector instanceof EntryPointProvider epp) { + if (epp.numberOfEntryPoints() <= 0) { + throw new IllegalArgumentException("The number of entry points must be > 0"); + } + DocIdSetIterator eps = epp.entryPoints(); + entryPoints = new int[epp.numberOfEntryPoints()]; + int idx = 0; + while (idx < entryPoints.length) { + int entryPointOrdInt = eps.nextDoc(); + if (entryPointOrdInt == NO_MORE_DOCS) { + throw new IllegalArgumentException( + "The number of entry points provided is less than the number of entry points requested"); + } + assert entryPointOrdInt < getGraphSize(graph); + entryPoints[idx++] = entryPointOrdInt; + } + // This is an invalid case, but we should check it + assert entryPoints.length > 0; + // We use provided entry point ordinals to search the complete graph (level 0) + graphSearcher.searchLevel(knnCollector, scorer, 0, entryPoints, graph, acceptOrds); + } else { + search(scorer, knnCollector, graph, graphSearcher, acceptOrds); + } } /** diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswUtil.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswUtil.java index d0d398be2a78..9540a972fb20 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswUtil.java @@ -129,6 +129,7 @@ static List components( } Component component = markRooted(hnsw, level, connectedNodes, notFullyConnected, maxConn, nextClear); + assert component.start() == nextClear; assert component.size() > 0; components.add(component); total += component.size(); diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java index ed1a5ffb59fa..5225fe700ab9 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java @@ -24,54 +24,24 @@ /** * Wraps a provided KnnCollector object, translating the provided vectorId ordinal to a documentId */ -public final class OrdinalTranslatedKnnCollector implements KnnCollector { +public final class OrdinalTranslatedKnnCollector extends KnnCollector.Decorator { - private final KnnCollector in; private final IntToIntFunction vectorOrdinalToDocId; - public OrdinalTranslatedKnnCollector(KnnCollector in, IntToIntFunction vectorOrdinalToDocId) { - this.in = in; + public OrdinalTranslatedKnnCollector( + KnnCollector collector, IntToIntFunction vectorOrdinalToDocId) { + super(collector); this.vectorOrdinalToDocId = vectorOrdinalToDocId; } - @Override - public boolean earlyTerminated() { - return in.earlyTerminated(); - } - - @Override - public void incVisitedCount(int count) { - in.incVisitedCount(count); - } - - @Override - public long visitedCount() { - return in.visitedCount(); - } - - @Override - public long 
visitLimit() { - return in.visitLimit(); - } - - @Override - public int k() { - return in.k(); - } - @Override public boolean collect(int vectorId, float similarity) { - return in.collect(vectorOrdinalToDocId.apply(vectorId), similarity); - } - - @Override - public float minCompetitiveSimilarity() { - return in.minCompetitiveSimilarity(); + return super.collect(vectorOrdinalToDocId.apply(vectorId), similarity); } @Override public TopDocs topDocs() { - TopDocs td = in.topDocs(); + TopDocs td = super.topDocs(); return new TopDocs( new TotalHits( visitedCount(), diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java index 1369aa5e3f40..6b2d45d3e294 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java @@ -98,7 +98,7 @@ public float dotProduct(float[] a, float[] b) { int i = 0; float res = 0; - // if the array size is large (> 2x platform vector size), its worth the overhead to vectorize + // if the array size is large (> 2x platform vector size), it's worth the overhead to vectorize if (a.length > 2 * FLOAT_SPECIES.length()) { i += FLOAT_SPECIES.loopBound(a.length); res += dotProductBody(a, b, i); @@ -161,7 +161,7 @@ public float cosine(float[] a, float[] b) { float norm1 = 0; float norm2 = 0; - // if the array size is large (> 2x platform vector size), its worth the overhead to vectorize + // if the array size is large (> 2x platform vector size), it's worth the overhead to vectorize if (a.length > 2 * FLOAT_SPECIES.length()) { i += FLOAT_SPECIES.loopBound(a.length); float[] ret = cosineBody(a, b, i); @@ -226,7 +226,7 @@ public float squareDistance(float[] a, float[] b) { int i = 0; float res = 0; - // if the array size is large (> 2x platform vector size), its worth the overhead to vectorize + // if the array size is large (> 2x platform vector size), it's worth the overhead to vectorize if (a.length > 2 * FLOAT_SPECIES.length()) { i += FLOAT_SPECIES.loopBound(a.length); res += squareDistanceBody(a, b, i); diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java index 2424b53645bd..800a66a2167e 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java @@ -337,8 +337,6 @@ public void prefetch(long offset, long length) throws IOException { ensureOpen(); - Objects.checkFromIndexSize(offset, length, length()); - if (BitUtil.isZeroOrPowerOfTwo(consecutivePrefetchHitCount++) == false) { // We've had enough consecutive hits on the page cache that this number is neither zero nor a // power of two. 
There is a good chance that a good chunk of this index input is cached in @@ -381,8 +379,6 @@ void advise(long offset, long length, IOConsumer advice) throws I ensureOpen(); - Objects.checkFromIndexSize(offset, length, length()); - final NativeAccess nativeAccess = NATIVE_ACCESS.get(); try { @@ -601,7 +597,7 @@ public final MemorySegmentIndexInput clone() { */ @Override public final MemorySegmentIndexInput slice(String sliceDescription, long offset, long length) { - if (offset < 0 || length < 0 || offset + length > this.length) { + if ((length | offset) < 0 || length > this.length - offset) { throw new IllegalArgumentException( "slice() " + sliceDescription @@ -818,6 +814,12 @@ public MemorySegment segmentSliceOrNull(long pos, long len) throws IOException { throw handlePositionalIOOBE(e, "segmentSliceOrNull", pos); } } + + @Override + public void prefetch(long offset, long length) throws IOException { + Objects.checkFromIndexSize(offset, length, this.length); + super.prefetch(offset, length); + } } /** This class adds offset support to MemorySegmentIndexInput, which is needed for slices. */ @@ -903,5 +905,11 @@ public MemorySegment segmentSliceOrNull(long pos, long len) throws IOException { MemorySegmentIndexInput buildSlice(String sliceDescription, long ofs, long length) { return super.buildSlice(sliceDescription, this.offset + ofs, length); } + + @Override + public void prefetch(long offset, long length) throws IOException { + Objects.checkFromIndexSize(offset, length, this.length); + super.prefetch(this.offset + offset, length); + } } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java index 3e346f3eb206..d41ab472ea60 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java @@ -56,7 +56,9 @@ public void testEncodeDecode() throws IOException { for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { source[j] = values[i * ForUtil.BLOCK_SIZE + j]; } - forDeltaUtil.encodeDeltas(source, out); + int bitsPerValue = forDeltaUtil.bitsRequired(source); + out.writeByte((byte) bitsPerValue); + forDeltaUtil.encodeDeltas(bitsPerValue, source, out); } endPointer = out.getFilePointer(); out.close(); @@ -71,7 +73,8 @@ public void testEncodeDecode() throws IOException { for (int i = 0; i < iterations; ++i) { int base = 0; final int[] restored = new int[ForUtil.BLOCK_SIZE]; - forDeltaUtil.decodeAndPrefixSum(pdu, base, restored); + int bitsPerValue = pdu.in.readByte(); + forDeltaUtil.decodeAndPrefixSum(bitsPerValue, pdu, base, restored); final int[] expected = new int[ForUtil.BLOCK_SIZE]; for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { expected[j] = values[i * ForUtil.BLOCK_SIZE + j]; diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormatV0.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormatV0.java new file mode 100644 index 000000000000..037527413ea8 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormatV0.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene101; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; +import org.apache.lucene.tests.index.BasePostingsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; + +public class TestLucene101PostingsFormatV0 extends BasePostingsFormatTestCase { + + @Override + protected Codec getCodec() { + return TestUtil.alwaysPostingsFormat( + new Lucene101PostingsFormat( + Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, + Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE, + Lucene101PostingsFormat.VERSION_START)); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java index 33918c4d8dc8..bd0696f66d16 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.document; +import static org.hamcrest.Matchers.anyOf; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; @@ -480,7 +481,9 @@ public void testStoreTermVectors() throws Exception { FeatureField invalid = new FeatureField("features", "pagerank", 1, false); doc.add(invalid); var exc = expectThrows(Exception.class, () -> writer.addDocument(doc)); - assertThat(exc.getMessage(), containsString("store term vector")); + assertThat( + exc.getMessage(), + anyOf(containsString("store term vector"), containsString("storeTermVector"))); writer.forceMerge(1); DirectoryReader reader = writer.getReader(); diff --git a/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java index 2023ee73391d..7db87e231ff0 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java @@ -43,7 +43,6 @@ public void testLargeSegment() throws Exception { 128)); // Make sure to use the ConfigurableMCodec instead of a random one iwc.setRAMBufferSizeMB(64); // Use a 64MB buffer to create larger initial segments TieredMergePolicy mp = new TieredMergePolicy(); - mp.setMaxMergeAtOnce(256); // avoid intermediate merges (waste of time with HNSW?) 
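The removal of TieredMergePolicy#setMaxMergeAtOnce above (and throughout the test changes below) means merge width is now driven by segmentsPerTier alone. A minimal sketch of the adjusted writer configuration, assuming only the public IndexWriterConfig and TieredMergePolicy APIs; the class and method names are illustrative and not part of the patch:

    // Sketch only: configure merging without setMaxMergeAtOnce, which no longer exists.
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.TieredMergePolicy;

    class MergeConfigSketch {
      static IndexWriterConfig largeSegmentConfig() {
        TieredMergePolicy mp = new TieredMergePolicy();
        mp.setSegmentsPerTier(256); // defer merging until an explicit forceMerge at the end
        return new IndexWriterConfig().setMergePolicy(mp);
      }
    }
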
mp.setSegmentsPerTier(256); // only merge once at the end when we ask iwc.setMergePolicy(mp); String fieldName = "field"; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAtomicUpdate.java b/lucene/core/src/test/org/apache/lucene/index/TestAtomicUpdate.java index b951a565d623..5dc8327c7997 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAtomicUpdate.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAtomicUpdate.java @@ -109,7 +109,7 @@ public void runTest(Directory directory) throws Exception { IndexWriterConfig conf = new IndexWriterConfig(new MockAnalyzer(random())).setMaxBufferedDocs(7); - ((TieredMergePolicy) conf.getMergePolicy()).setMaxMergeAtOnce(3); + ((TieredMergePolicy) conf.getMergePolicy()).setSegmentsPerTier(3); IndexWriter writer = RandomIndexWriter.mockIndexWriter(directory, conf, random()); // Establish a base index of 100 docs: diff --git a/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java b/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java index e0b2c49d8548..0d52481b908e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java @@ -375,7 +375,6 @@ protected void doMerge(MergeSource mergeSource, MergePolicy.OneMerge merge) TieredMergePolicy tmp = new TieredMergePolicy(); iwc.setMergePolicy(tmp); - tmp.setMaxMergeAtOnce(2); tmp.setSegmentsPerTier(2); IndexWriter w = new IndexWriter(dir, iwc); @@ -418,7 +417,6 @@ protected void doMerge(MergeSource mergeSource, MergePolicy.OneMerge merge) dir.close(); } - @SuppressForbidden(reason = "Thread sleep") public void testIntraMergeThreadPoolIsLimitedByMaxThreads() throws IOException { ConcurrentMergeScheduler mergeScheduler = new ConcurrentMergeScheduler(); MergeScheduler.MergeSource mergeSource = @@ -475,11 +473,12 @@ public void merge(MergePolicy.OneMerge merge) throws IOException { Executor executor = mergeScheduler.intraMergeExecutor; AtomicInteger threadsExecutedOnPool = new AtomicInteger(); AtomicInteger threadsExecutedOnSelf = new AtomicInteger(); - for (int i = 0; i < 4; i++) { + CountDownLatch latch = new CountDownLatch(1); + final int totalThreads = 4; + for (int i = 0; i < totalThreads; i++) { mergeScheduler.mergeThreads.add( mergeScheduler.new MergeThread(mergeSource, merge) { @Override - @SuppressForbidden(reason = "Thread sleep") public void run() { executor.execute( () -> { @@ -489,7 +488,7 @@ public void run() { threadsExecutedOnPool.incrementAndGet(); } try { - Thread.sleep(100); + latch.await(); } catch (InterruptedException e) { throw new RuntimeException(e); } @@ -500,6 +499,10 @@ public void run() { for (ConcurrentMergeScheduler.MergeThread thread : mergeScheduler.mergeThreads) { thread.start(); } + while (threadsExecutedOnSelf.get() + threadsExecutedOnPool.get() < totalThreads) { + Thread.yield(); + } + latch.countDown(); mergeScheduler.sync(); assertEquals(3, threadsExecutedOnSelf.get()); assertEquals(1, threadsExecutedOnPool.get()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java index d06330c29269..924700835403 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java @@ -36,7 +36,7 @@ public class TestForTooMuchCloning extends LuceneTestCase { public void test() throws Exception { 
final MockDirectoryWrapper dir = newMockDirectory(); final TieredMergePolicy tmp = new TieredMergePolicy(); - tmp.setMaxMergeAtOnce(2); + tmp.setSegmentsPerTier(2); final RandomIndexWriter w = new RandomIndexWriter( random(), diff --git a/lucene/core/src/test/org/apache/lucene/index/TestForceMergeForever.java b/lucene/core/src/test/org/apache/lucene/index/TestForceMergeForever.java index 78514b2c3ed5..a5cf2b67f488 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestForceMergeForever.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestForceMergeForever.java @@ -70,7 +70,7 @@ public void test() throws Exception { MergePolicy mp = w.getConfig().getMergePolicy(); final int mergeAtOnce = 1 + w.cloneSegmentInfos().size(); if (mp instanceof TieredMergePolicy) { - ((TieredMergePolicy) mp).setMaxMergeAtOnce(mergeAtOnce); + ((TieredMergePolicy) mp).setSegmentsPerTier(mergeAtOnce); } else if (mp instanceof LogMergePolicy) { ((LogMergePolicy) mp).setMergeFactor(mergeAtOnce); } else { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestLogMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestLogMergePolicy.java index ea60f9b1e090..8c121c1f6a05 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestLogMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestLogMergePolicy.java @@ -50,8 +50,13 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws @Override protected void assertMerge(MergePolicy policy, MergeSpecification merge) throws IOException { LogMergePolicy lmp = (LogMergePolicy) policy; + MergeContext mockMergeContext = new MockMergeContext(SegmentCommitInfo::getDelCount); for (OneMerge oneMerge : merge.merges) { - assertTrue(oneMerge.segments.size() <= lmp.getMergeFactor()); + long mergeSize = 0; + for (SegmentCommitInfo info : oneMerge.segments) { + mergeSize += lmp.size(info, mockMergeContext); + } + assertTrue(mergeSize < lmp.minMergeSize || oneMerge.segments.size() <= lmp.getMergeFactor()); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java index 8039d8b8f6fb..285296d55c19 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java @@ -25,6 +25,7 @@ import java.util.Collections; import java.util.List; import java.util.Locale; +import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.hnsw.HnswGraphProvider; import org.apache.lucene.document.BinaryDocValuesField; @@ -153,14 +154,16 @@ public void testSortOnAddIndicesRandom() throws IOException { docIds.add(i); } Collections.shuffle(docIds, random()); - // If true, index a vector for every doc - boolean denseVectors = random().nextBoolean(); + // If true, index a vector and points for every doc + boolean dense = random().nextBoolean(); try (RandomIndexWriter iw = new RandomIndexWriter(random(), dir)) { for (int i = 0; i < numDocs; i++) { int docId = docIds.get(i); Document doc = new Document(); doc.add(new StringField("string_id", Integer.toString(docId), Field.Store.YES)); - doc.add(new LongPoint("point_id", docId)); + if (dense || docId % 3 == 0) { + doc.add(new LongPoint("point_id", docId)); + } String s = RandomStrings.randomRealisticUnicodeOfLength(random(), 25); doc.add(new TextField("text_field", s, Field.Store.YES)); doc.add(new 
BinaryDocValuesField("text_field", new BytesRef(s))); @@ -172,7 +175,7 @@ public void testSortOnAddIndicesRandom() throws IOException { doc.add(new BinaryDocValuesField("binary_dv", new BytesRef(Integer.toString(docId)))); doc.add( new SortedSetDocValuesField("sorted_set_dv", new BytesRef(Integer.toString(docId)))); - if (denseVectors || docId % 2 == 0) { + if (dense || docId % 2 == 0) { doc.add(new KnnFloatVectorField("vector", new float[] {(float) docId})); } doc.add(new NumericDocValuesField("foo", random().nextInt(20))); @@ -245,8 +248,13 @@ public void testSortOnAddIndicesRandom() throws IOException { SortedSetDocValues sorted_set_dv = leaf.getSortedSetDocValues("sorted_set_dv"); SortedDocValues binary_sorted_dv = leaf.getSortedDocValues("binary_sorted_dv"); FloatVectorValues vectorValues = leaf.getFloatVectorValues("vector"); - HnswGraph graph = - ((HnswGraphProvider) ((CodecReader) leaf).getVectorReader()).getGraph("vector"); + KnnVectorsReader vectorsReader = ((CodecReader) leaf).getVectorReader(); + HnswGraph graph; + if (vectorsReader instanceof HnswGraphProvider hnswGraphProvider) { + graph = hnswGraphProvider.getGraph("vector"); + } else { + graph = null; + } NumericDocValues ids = leaf.getNumericDocValues("id"); long prevValue = -1; boolean usingAltIds = false; @@ -272,10 +280,12 @@ public void testSortOnAddIndicesRandom() throws IOException { assertTrue(sorted_numeric_dv.advanceExact(idNext)); assertTrue(sorted_set_dv.advanceExact(idNext)); assertTrue(binary_sorted_dv.advanceExact(idNext)); - if (denseVectors || prevValue % 2 == 0) { + if (dense || prevValue % 2 == 0) { assertEquals(idNext, valuesIterator.advance(idNext)); - graph.seek(0, valuesIterator.index()); - assertNotEquals(DocIdSetIterator.NO_MORE_DOCS, graph.nextNeighbor()); + if (graph != null) { + graph.seek(0, valuesIterator.index()); + assertNotEquals(DocIdSetIterator.NO_MORE_DOCS, graph.nextNeighbor()); + } } assertEquals(new BytesRef(ids.longValue() + ""), binary_dv.binaryValue()); @@ -289,7 +299,7 @@ public void testSortOnAddIndicesRandom() throws IOException { assertEquals(1, sorted_numeric_dv.docValueCount()); assertEquals(ids.longValue(), sorted_numeric_dv.nextValue()); - if (denseVectors || prevValue % 2 == 0) { + if (dense || prevValue % 2 == 0) { float[] vectorValue = vectorValues.vectorValue(valuesIterator.index()); assertEquals(1, vectorValue.length); assertEquals((float) ids.longValue(), vectorValue[0], 0.001f); @@ -306,9 +316,13 @@ public void testSortOnAddIndicesRandom() throws IOException { leaf.storedFields().document(idNext).get("string_id")); IndexSearcher searcher = new IndexSearcher(r); TopDocs result = - searcher.search(LongPoint.newExactQuery("point_id", ids.longValue()), 1); - assertEquals(1, result.totalHits.value()); - assertEquals(idNext, result.scoreDocs[0].doc); + searcher.search(LongPoint.newExactQuery("point_id", ids.longValue()), 10); + if (dense || ids.longValue() % 3 == 0) { + assertEquals(1, result.totalHits.value()); + assertEquals(idNext, result.scoreDocs[0].doc); + } else { + assertEquals(0, result.totalHits.value()); + } result = searcher.search(new TermQuery(new Term("string_id", "" + ids.longValue())), 1); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java index 58e33cb7648e..6f11f71f3a99 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java @@ -104,7 +104,7 @@ 
protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws // below we make the assumption that segments that reached the max segment // size divided by 2 don't need merging anymore - int mergeFactor = (int) Math.min(tmp.getSegmentsPerTier(), tmp.getMaxMergeAtOnce()); + int mergeFactor = (int) tmp.getSegmentsPerTier(); while (true) { final double segCountLevel = bytesLeft / (double) levelSizeBytes; if (segCountLevel <= tmp.getSegmentsPerTier() @@ -145,12 +145,11 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws assertTrue( String.format( Locale.ROOT, - "mergeFactor=%d minSegmentBytes=%,d maxMergedSegmentBytes=%,d segmentsPerTier=%g maxMergeAtOnce=%d numSegments=%d allowed=%g totalBytes=%,d delPercentage=%g deletesPctAllowed=%g targetNumSegments=%d", + "mergeFactor=%d minSegmentBytes=%,d maxMergedSegmentBytes=%,d segmentsPerTier=%g numSegments=%d allowed=%g totalBytes=%,d delPercentage=%g deletesPctAllowed=%g targetNumSegments=%d", mergeFactor, minSegmentBytes, maxMergedSegmentBytes, tmp.getSegmentsPerTier(), - tmp.getMaxMergeAtOnce(), numSegments, allowedSegCount, totalBytes, @@ -162,10 +161,7 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws @Override protected void assertMerge(MergePolicy policy, MergeSpecification merges) { - TieredMergePolicy tmp = (TieredMergePolicy) policy; - for (OneMerge merge : merges.merges) { - assertTrue(merge.segments.size() <= tmp.getMaxMergeAtOnce()); - } + // anything to assert? } public void testForceMergeDeletes() throws Exception { @@ -174,7 +170,6 @@ public void testForceMergeDeletes() throws Exception { TieredMergePolicy tmp = newTieredMergePolicy(); conf.setMergePolicy(tmp); conf.setMaxBufferedDocs(4); - tmp.setMaxMergeAtOnce(100); tmp.setSegmentsPerTier(100); tmp.setDeletesPctAllowed(50.0); tmp.setForceMergeDeletesPctAllowed(30.0); @@ -219,8 +214,8 @@ public void testPartialMerge() throws Exception { TieredMergePolicy tmp = newTieredMergePolicy(); conf.setMergePolicy(tmp); conf.setMaxBufferedDocs(2); - tmp.setMaxMergeAtOnce(3); tmp.setSegmentsPerTier(6); + tmp.setFloorSegmentMB(Double.MIN_VALUE); IndexWriter w = new IndexWriter(dir, conf); int maxCount = 0; @@ -231,7 +226,7 @@ public void testPartialMerge() throws Exception { w.addDocument(doc); int count = w.getSegmentCount(); maxCount = Math.max(count, maxCount); - assertTrue("count=" + count + " maxCount=" + maxCount, count >= maxCount - 3); + assertTrue("count=" + count + " maxCount=" + maxCount, count >= maxCount - 6); } w.flush(true, true); @@ -973,15 +968,13 @@ public void testMergeSizeIsLessThanFloorSize() throws IOException { assertEquals(15, oneMerge.segments.size()); } - // Segments are below the floor segment size and we'd need to merge more than maxMergeAtOnce - // segments to go above the minimum segment size. We get 1 merge of maxMergeAtOnce=30 segments - // and 1 merge of 50-30=20 segments. + // Segments are below the floor segment size. We get one merge that merges the 50 segments + // together. 
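As the comment above notes, once there is no per-merge segment cap, every segment below the floor size can be collapsed in a single merge. A hedged sketch of the knob this relies on, with illustrative values that are not taken from the test:

    // Sketch only: segments under the floor size are treated as equally sized,
    // so a single merge may now take all of them at once.
    import org.apache.lucene.index.TieredMergePolicy;

    class FloorSizeSketch {
      static TieredMergePolicy floorHeavyPolicy() {
        TieredMergePolicy mp = new TieredMergePolicy();
        mp.setFloorSegmentMB(60); // segments under ~60 MB count as floor-sized
        return mp;
      }
    }
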
mergePolicy.setFloorSegmentMB(60); mergeSpec = mergePolicy.findMerges(MergeTrigger.FULL_FLUSH, infos, mergeContext); assertNotNull(mergeSpec); - assertEquals(2, mergeSpec.merges.size()); - assertEquals(30, mergeSpec.merges.get(0).segments.size()); - assertEquals(20, mergeSpec.merges.get(1).segments.size()); + assertEquals(1, mergeSpec.merges.size()); + assertEquals(50, mergeSpec.merges.get(0).segments.size()); } public void testFullFlushMerges() throws IOException { @@ -1008,6 +1001,6 @@ public void testFullFlushMerges() throws IOException { segmentInfos = applyMerge(segmentInfos, merge, "_" + segNameGenerator.getAndIncrement(), stats); } - assertEquals(2, segmentInfos.size()); + assertEquals(1, segmentInfos.size()); } } diff --git a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java index 8a0d3b65aea9..49a35b75f151 100644 --- a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.util.HashSet; +import java.util.Random; import java.util.Set; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.document.Document; @@ -40,7 +41,9 @@ import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.QueryTimeout; +import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.StoredFields; import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorEncoding; @@ -481,6 +484,62 @@ public void testSkewedIndex() throws IOException { } } + /** Tests with random vectors, number of documents, etc. Uses RandomIndexWriter. 
*/ + public void testRandomConsistencySingleThreaded() throws IOException { + assertRandomConsistency(false); + } + + @AwaitsFix(bugUrl = "https://github.com/apache/lucene/issues/14180") + public void testRandomConsistencyMultiThreaded() throws IOException { + assertRandomConsistency(true); + } + + private void assertRandomConsistency(boolean multiThreaded) throws IOException { + int numDocs = 100; + int dimension = 4; + int numIters = 10; + boolean everyDocHasAVector = random().nextBoolean(); + Random r = random(); + try (Directory d = newDirectoryForTest()) { + // To ensure consistency between seeded runs, remove some randomness + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setMergeScheduler(new SerialMergeScheduler()); + iwc.setMergePolicy(NoMergePolicy.INSTANCE); + iwc.setMaxBufferedDocs(numDocs); + iwc.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH); + try (IndexWriter w = new IndexWriter(d, iwc)) { + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (everyDocHasAVector || random().nextInt(10) != 2) { + doc.add(getKnnVectorField("field", randomVector(dimension))); + } + w.addDocument(doc); + if (r.nextBoolean() && i % 50 == 0) { + w.flush(); + } + } + } + try (IndexReader reader = DirectoryReader.open(d)) { + IndexSearcher searcher = newSearcher(reader, true, true, multiThreaded); + // first get the initial set of docs, and we expect all future queries to be exactly the + // same + int k = random().nextInt(80) + 1; + AbstractKnnVectorQuery query = getKnnVectorQuery("field", randomVector(dimension), k); + int n = random().nextInt(100) + 1; + TopDocs expectedResults = searcher.search(query, n); + for (int i = 0; i < numIters; i++) { + TopDocs results = searcher.search(query, n); + assertEquals(expectedResults.totalHits.value(), results.totalHits.value()); + assertEquals(expectedResults.scoreDocs.length, results.scoreDocs.length); + for (int j = 0; j < results.scoreDocs.length; j++) { + assertEquals(expectedResults.scoreDocs[j].doc, results.scoreDocs[j].doc); + assertEquals(expectedResults.scoreDocs[j].score, results.scoreDocs[j].score, EPSILON); + } + } + } + } + } + /** Tests with random vectors, number of documents, etc. Uses RandomIndexWriter. 
*/ public void testRandom() throws IOException { int numDocs = atLeast(100); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDisiPriorityQueue.java b/lucene/core/src/test/org/apache/lucene/search/TestDisiPriorityQueue.java index fb7afac8ba47..967c5a34d7dc 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestDisiPriorityQueue.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestDisiPriorityQueue.java @@ -26,6 +26,42 @@ public class TestDisiPriorityQueue extends LuceneTestCase { + public void testDisiPriorityQueue2() throws IOException { + Random r = random(); + DisiWrapper w1 = wrapper(randomDisi(r)); + DisiWrapper w2 = wrapper(randomDisi(r)); + DisiWrapper w3 = wrapper(randomDisi(r)); + + DisiPriorityQueue pq = DisiPriorityQueue.ofMaxSize(2); + w1.doc = 1; + w2.doc = 0; + assertNull(pq.top()); + assertEquals(0, pq.size()); + assertSame(w1, pq.add(w1)); + assertSame(w1, pq.top()); + assertEquals(1, pq.size()); + assertSame(w2, pq.add(w2)); + assertSame(w2, pq.top()); + assertEquals(2, pq.size()); + expectThrows(IllegalStateException.class, () -> pq.add(w3)); + + w2.doc = 1; + assertSame(w2, pq.updateTop()); + DisiWrapper topList = pq.topList(); + assertSame(w1, topList); + assertSame(w2, topList.next); + assertNull(topList.next.next); + + w2.doc = 2; + assertSame(w1, pq.updateTop()); + topList = pq.topList(); + assertSame(w1, topList); + assertNull(topList.next); + + assertSame(w1, pq.pop()); + assertSame(w2, pq.top()); + } + public void testRandom() throws Exception { Random r = random(); @@ -37,7 +73,7 @@ public void testRandom() throws Exception { all[i] = w; } - DisiPriorityQueue pq = new DisiPriorityQueue(size); + DisiPriorityQueue pq = DisiPriorityQueue.ofMaxSize(size); if (r.nextBoolean()) { for (DisiWrapper w : all) { pq.add(w); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQuery.java index b45d6e8fb641..21219e0e1d99 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQuery.java @@ -61,7 +61,7 @@ Field getKnnVectorField(String name, float[] vector) { return new KnnByteVectorField(name, floatToBytes(vector), VectorSimilarityFunction.EUCLIDEAN); } - private static byte[] floatToBytes(float[] query) { + static byte[] floatToBytes(float[] query) { byte[] bytes = new byte[query.length]; for (int i = 0; i < query.length; i++) { assert query[i] <= Byte.MAX_VALUE && query[i] >= Byte.MIN_VALUE && (query[i] % 1) == 0 @@ -109,7 +109,7 @@ public void testVectorEncodingMismatch() throws IOException { } } - private static class ThrowingKnnVectorQuery extends KnnByteVectorQuery { + static class ThrowingKnnVectorQuery extends KnnByteVectorQuery { public ThrowingKnnVectorQuery(String field, byte[] target, int k, Query filter) { super(field, target, k, filter); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestKnnFloatVectorQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestKnnFloatVectorQuery.java index 5dcb6f97df93..ece2b385654e 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestKnnFloatVectorQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestKnnFloatVectorQuery.java @@ -259,7 +259,7 @@ public void testDocAndScoreQueryBasics() throws IOException { } } - private static class ThrowingKnnVectorQuery extends KnnFloatVectorQuery { + static class ThrowingKnnVectorQuery extends KnnFloatVectorQuery { public 
ThrowingKnnVectorQuery(String field, float[] target, int k, Query filter) { super(field, target, k, filter); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSeededKnnByteVectorQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestSeededKnnByteVectorQuery.java new file mode 100644 index 000000000000..d0fb8c95e035 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestSeededKnnByteVectorQuery.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import static org.apache.lucene.search.TestKnnByteVectorQuery.floatToBytes; + +import java.io.IOException; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.KnnByteVectorField; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.QueryTimeout; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.TestVectorUtil; + +public class TestSeededKnnByteVectorQuery extends BaseKnnVectorQueryTestCase { + + private static final Query MATCH_NONE = new MatchNoDocsQuery(); + + @Override + AbstractKnnVectorQuery getKnnVectorQuery(String field, float[] query, int k, Query queryFilter) { + return new SeededKnnByteVectorQuery(field, floatToBytes(query), k, queryFilter, MATCH_NONE); + } + + @Override + AbstractKnnVectorQuery getThrowingKnnVectorQuery(String field, float[] vec, int k, Query query) { + return new ThrowingKnnVectorQuery(field, floatToBytes(vec), k, query, MATCH_NONE); + } + + @Override + float[] randomVector(int dim) { + byte[] b = TestVectorUtil.randomVectorBytes(dim); + float[] v = new float[b.length]; + int vi = 0; + for (int i = 0; i < v.length; i++) { + v[vi++] = b[i]; + } + return v; + } + + @Override + Field getKnnVectorField( + String name, float[] vector, VectorSimilarityFunction similarityFunction) { + return new KnnByteVectorField(name, floatToBytes(vector), similarityFunction); + } + + @Override + Field getKnnVectorField(String name, float[] vector) { + return new KnnByteVectorField(name, floatToBytes(vector), VectorSimilarityFunction.EUCLIDEAN); + } + + /** Tests with random vectors and a random seed. Uses RandomIndexWriter. 
*/ + public void testRandomWithSeed() throws IOException { + int numDocs = 1000; + int dimension = atLeast(5); + int numIters = atLeast(10); + int numDocsWithVector = 0; + try (Directory d = newDirectoryForTest()) { + // Always use the default kNN format to have predictable behavior around when it hits + // visitedLimit. This is fine since the test targets AbstractKnnVectorQuery logic, not the kNN + // format + // implementation. + IndexWriterConfig iwc = new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec()); + RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (random().nextBoolean()) { + // Randomly skip some vectors to test the mapping from docid to ordinals + doc.add(getKnnVectorField("field", randomVector(dimension))); + numDocsWithVector += 1; + } + doc.add(new NumericDocValuesField("tag", i)); + doc.add(new IntPoint("tag", i)); + w.addDocument(doc); + } + w.forceMerge(1); + w.close(); + + try (IndexReader reader = DirectoryReader.open(d)) { + IndexSearcher searcher = newSearcher(reader); + for (int i = 0; i < numIters; i++) { + int k = random().nextInt(80) + 1; + int n = random().nextInt(100) + 1; + // we may get fewer results than requested if there are deletions, but this test doesn't + // check that + assert reader.hasDeletions() == false; + + // All documents as seeds + Query seed1 = new MatchAllDocsQuery(); + Query filter = random().nextBoolean() ? null : new MatchAllDocsQuery(); + SeededKnnByteVectorQuery query = + new SeededKnnByteVectorQuery( + "field", floatToBytes(randomVector(dimension)), k, filter, seed1); + TopDocs results = searcher.search(query, n); + int expected = Math.min(Math.min(n, k), numDocsWithVector); + + assertEquals(expected, results.scoreDocs.length); + assertTrue(results.totalHits.value() >= results.scoreDocs.length); + // verify the results are in descending score order + float last = Float.MAX_VALUE; + for (ScoreDoc scoreDoc : results.scoreDocs) { + assertTrue(scoreDoc.score <= last); + last = scoreDoc.score; + } + + // Restrictive seed query -- 6 documents + Query seed2 = IntPoint.newRangeQuery("tag", 1, 6); + query = + new SeededKnnByteVectorQuery( + "field", floatToBytes(randomVector(dimension)), k, null, seed2); + results = searcher.search(query, n); + expected = Math.min(Math.min(n, k), reader.numDocs()); + assertEquals(expected, results.scoreDocs.length); + assertTrue(results.totalHits.value() >= results.scoreDocs.length); + // verify the results are in descending score order + last = Float.MAX_VALUE; + for (ScoreDoc scoreDoc : results.scoreDocs) { + assertTrue(scoreDoc.score <= last); + last = scoreDoc.score; + } + + // No seed documents -- falls back on full approx search + Query seed3 = new MatchNoDocsQuery(); + query = + new SeededKnnByteVectorQuery( + "field", floatToBytes(randomVector(dimension)), k, null, seed3); + results = searcher.search(query, n); + expected = Math.min(Math.min(n, k), reader.numDocs()); + assertEquals(expected, results.scoreDocs.length); + assertTrue(results.totalHits.value() >= results.scoreDocs.length); + // verify the results are in descending score order + last = Float.MAX_VALUE; + for (ScoreDoc scoreDoc : results.scoreDocs) { + assertTrue(scoreDoc.score <= last); + last = scoreDoc.score; + } + } + } + } + } + + private static class ThrowingKnnVectorQuery extends SeededKnnByteVectorQuery { + + public ThrowingKnnVectorQuery(String field, byte[] target, int k, Query filter, Query seed) { + super(field, target, k, filter, 
seed); + } + + private ThrowingKnnVectorQuery( + String field, byte[] target, int k, Query filter, Weight seedWeight) { + super(field, target, k, filter, seedWeight); + } + + @Override + // This is test only and we need to overwrite the inner rewrite to throw + public Query rewrite(IndexSearcher indexSearcher) throws IOException { + if (seedWeight != null) { + return super.rewrite(indexSearcher); + } + BooleanQuery.Builder booleanSeedQueryBuilder = + new BooleanQuery.Builder() + .add(seed, BooleanClause.Occur.MUST) + .add(new FieldExistsQuery(field), BooleanClause.Occur.FILTER); + if (filter != null) { + booleanSeedQueryBuilder.add(filter, BooleanClause.Occur.FILTER); + } + Query seedRewritten = indexSearcher.rewrite(booleanSeedQueryBuilder.build()); + Weight seedWeight = indexSearcher.createWeight(seedRewritten, ScoreMode.TOP_SCORES, 1f); + return new ThrowingKnnVectorQuery(field, target, k, filter, seedWeight) + .rewrite(indexSearcher); + } + + @Override + protected TopDocs exactSearch( + LeafReaderContext context, DocIdSetIterator acceptIterator, QueryTimeout queryTimeout) { + throw new UnsupportedOperationException("exact search is not supported"); + } + + @Override + public String toString(String field) { + return null; + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSeededKnnFloatVectorQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestSeededKnnFloatVectorQuery.java new file mode 100644 index 000000000000..d5630037ef74 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestSeededKnnFloatVectorQuery.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search; + +import java.io.IOException; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.QueryTimeout; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.TestVectorUtil; + +public class TestSeededKnnFloatVectorQuery extends BaseKnnVectorQueryTestCase { + private static final Query MATCH_NONE = new MatchNoDocsQuery(); + + @Override + KnnFloatVectorQuery getKnnVectorQuery(String field, float[] query, int k, Query queryFilter) { + return new SeededKnnFloatVectorQuery(field, query, k, queryFilter, MATCH_NONE); + } + + @Override + AbstractKnnVectorQuery getThrowingKnnVectorQuery(String field, float[] vec, int k, Query query) { + return new ThrowingKnnVectorQuery(field, vec, k, query, MATCH_NONE); + } + + @Override + float[] randomVector(int dim) { + return TestVectorUtil.randomVector(dim); + } + + @Override + Field getKnnVectorField( + String name, float[] vector, VectorSimilarityFunction similarityFunction) { + return new KnnFloatVectorField(name, vector, similarityFunction); + } + + @Override + Field getKnnVectorField(String name, float[] vector) { + return new KnnFloatVectorField(name, vector); + } + + /** Tests with random vectors and a random seed. Uses RandomIndexWriter. */ + public void testRandomWithSeed() throws IOException { + int numDocs = 1000; + int dimension = atLeast(5); + int numIters = atLeast(10); + int numDocsWithVector = 0; + try (Directory d = newDirectoryForTest()) { + // Always use the default kNN format to have predictable behavior around when it hits + // visitedLimit. This is fine since the test targets AbstractKnnVectorQuery logic, not the kNN + // format + // implementation. + IndexWriterConfig iwc = new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec()); + RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (random().nextBoolean()) { + // Randomly skip some vectors to test the mapping from docid to ordinals + doc.add(getKnnVectorField("field", randomVector(dimension))); + numDocsWithVector += 1; + } + doc.add(new NumericDocValuesField("tag", i)); + doc.add(new IntPoint("tag", i)); + w.addDocument(doc); + } + w.forceMerge(1); + w.close(); + + try (IndexReader reader = DirectoryReader.open(d)) { + IndexSearcher searcher = newSearcher(reader); + for (int i = 0; i < numIters; i++) { + int k = random().nextInt(80) + 1; + int n = random().nextInt(100) + 1; + // we may get fewer results than requested if there are deletions, but this test doesn't + // check that + assert reader.hasDeletions() == false; + + // All documents as seeds + Query seed1 = new MatchAllDocsQuery(); + Query filter = random().nextBoolean() ? 
null : new MatchAllDocsQuery(); + AbstractKnnVectorQuery query = + new SeededKnnFloatVectorQuery("field", randomVector(dimension), k, filter, seed1); + TopDocs results = searcher.search(query, n); + int expected = Math.min(Math.min(n, k), numDocsWithVector); + + assertEquals(expected, results.scoreDocs.length); + assertTrue(results.totalHits.value() >= results.scoreDocs.length); + // verify the results are in descending score order + float last = Float.MAX_VALUE; + for (ScoreDoc scoreDoc : results.scoreDocs) { + assertTrue(scoreDoc.score <= last); + last = scoreDoc.score; + } + + // Restrictive seed query -- 6 documents + Query seed2 = IntPoint.newRangeQuery("tag", 1, 6); + query = new SeededKnnFloatVectorQuery("field", randomVector(dimension), k, null, seed2); + results = searcher.search(query, n); + expected = Math.min(Math.min(n, k), reader.numDocs()); + assertEquals(expected, results.scoreDocs.length); + assertTrue(results.totalHits.value() >= results.scoreDocs.length); + // verify the results are in descending score order + last = Float.MAX_VALUE; + for (ScoreDoc scoreDoc : results.scoreDocs) { + assertTrue(scoreDoc.score <= last); + last = scoreDoc.score; + } + + // No seed documents -- falls back on full approx search + Query seed3 = new MatchNoDocsQuery(); + query = new SeededKnnFloatVectorQuery("field", randomVector(dimension), k, null, seed3); + results = searcher.search(query, n); + expected = Math.min(Math.min(n, k), reader.numDocs()); + assertEquals(expected, results.scoreDocs.length); + assertTrue(results.totalHits.value() >= results.scoreDocs.length); + // verify the results are in descending score order + last = Float.MAX_VALUE; + for (ScoreDoc scoreDoc : results.scoreDocs) { + assertTrue(scoreDoc.score <= last); + last = scoreDoc.score; + } + } + } + } + } + + private static class ThrowingKnnVectorQuery extends SeededKnnFloatVectorQuery { + + private ThrowingKnnVectorQuery(String field, float[] target, int k, Query filter, Query seed) { + super(field, target, k, filter, seed); + } + + private ThrowingKnnVectorQuery( + String field, float[] target, int k, Query filter, Weight seedWeight) { + super(field, target, k, filter, seedWeight); + } + + @Override + // This is test only and we need to overwrite the inner rewrite to throw + public Query rewrite(IndexSearcher indexSearcher) throws IOException { + if (seedWeight != null) { + return super.rewrite(indexSearcher); + } + BooleanQuery.Builder booleanSeedQueryBuilder = + new BooleanQuery.Builder() + .add(seed, BooleanClause.Occur.MUST) + .add(new FieldExistsQuery(field), BooleanClause.Occur.FILTER); + if (filter != null) { + booleanSeedQueryBuilder.add(filter, BooleanClause.Occur.FILTER); + } + Query seedRewritten = indexSearcher.rewrite(booleanSeedQueryBuilder.build()); + Weight seedWeight = indexSearcher.createWeight(seedRewritten, ScoreMode.TOP_SCORES, 1f); + return new ThrowingKnnVectorQuery(field, target, k, filter, seedWeight) + .rewrite(indexSearcher); + } + + @Override + protected TopDocs exactSearch( + LeafReaderContext context, DocIdSetIterator acceptIterator, QueryTimeout queryTimeout) { + throw new UnsupportedOperationException("exact search is not supported"); + } + + @Override + public String toString(String field) { + return null; + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/store/TestMMapDirectory.java b/lucene/core/src/test/org/apache/lucene/store/TestMMapDirectory.java index d01d6ec50ebb..f69befca850c 100644 --- a/lucene/core/src/test/org/apache/lucene/store/TestMMapDirectory.java +++ 
b/lucene/core/src/test/org/apache/lucene/store/TestMMapDirectory.java @@ -329,4 +329,42 @@ public void testNoGroupingFunc() { assertFalse(func.apply("segment.si").isPresent()); assertFalse(func.apply("_51a.si").isPresent()); } + + public void testPrefetchWithSingleSegment() throws IOException { + testPrefetchWithSegments(64 * 1024); + } + + public void testPrefetchWithMultiSegment() throws IOException { + testPrefetchWithSegments(16 * 1024); + } + + static final Class IOOBE = IndexOutOfBoundsException.class; + + // does not verify that the actual segment is prefetched, but rather exercises the code and bounds + void testPrefetchWithSegments(int maxChunkSize) throws IOException { + byte[] bytes = new byte[(maxChunkSize * 2) + 1]; + try (Directory dir = + new MMapDirectory(createTempDir("testPrefetchWithSegments"), maxChunkSize)) { + try (IndexOutput out = dir.createOutput("test", IOContext.DEFAULT)) { + out.writeBytes(bytes, 0, bytes.length); + } + + try (var in = dir.openInput("test", IOContext.READONCE)) { + in.prefetch(0, in.length()); + expectThrows(IOOBE, () -> in.prefetch(1, in.length())); + expectThrows(IOOBE, () -> in.prefetch(in.length(), 1)); + + var slice1 = in.slice("slice-1", 1, in.length() - 1); + slice1.prefetch(0, slice1.length()); + expectThrows(IOOBE, () -> slice1.prefetch(1, slice1.length())); + expectThrows(IOOBE, () -> slice1.prefetch(slice1.length(), 1)); + + // we sliced off all but one byte from the first complete memory segment + var slice2 = in.slice("slice-2", maxChunkSize - 1, in.length() - maxChunkSize + 1); + slice2.prefetch(0, slice2.length()); + expectThrows(IOOBE, () -> slice2.prefetch(1, slice2.length())); + expectThrows(IOOBE, () -> slice2.prefetch(slice2.length(), 1)); + } + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java b/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java index 88dbf24e2d13..1d9079a203fd 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java @@ -130,13 +130,20 @@ public void testRandom() throws IOException { for (j = 0; j < array.length; ) { final int l = TestUtil.nextInt(random(), 1, array.length - j); DocIdSetBuilder.BulkAdder adder = null; - for (int k = 0, budget = 0; k < l; ++k) { - if (budget == 0 || rarely()) { - budget = TestUtil.nextInt(random(), 1, l - k + 5); - adder = builder.grow(budget); + if (usually()) { + for (int k = 0, budget = 0; k < l; ++k) { + if (budget == 0 || rarely()) { + budget = TestUtil.nextInt(random(), 1, l - k + 5); + adder = builder.grow(budget); + } + adder.add(array[j++]); + budget--; } - adder.add(array[j++]); - budget--; + } else { + IntsRef intsRef = new IntsRef(array, j, l); + adder = builder.grow(l); + adder.add(intsRef); + j += l; } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java index b19e17f897c2..39acd5ea209e 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java @@ -17,9 +17,7 @@ package org.apache.lucene.util; import java.io.IOException; -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; import java.util.Random; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.tests.util.BaseBitSetTestCase; @@ -646,40 +644,72 @@ public void testScanIsEmpty() { } public void testOrRange() { - FixedBitSet set1 
= new FixedBitSet(1_000); - FixedBitSet set2 = new FixedBitSet(10_000); - for (int i = 0; i < set2.length(); i += 3) { - set2.set(i); + FixedBitSet dest = new FixedBitSet(1_000); + FixedBitSet source = new FixedBitSet(10_000); + for (int i = 0; i < source.length(); i += 3) { + source.set(i); } - // Check different values of `offset` - List offsets = new ArrayList<>(); - for (int offset = 64; offset < 128; ++offset) { - // Test all possible alignments - offsets.add(offset); + // Test all possible alignments, and both a "short" (less than 64) and a long length. + for (int sourceFrom = 64; sourceFrom < 128; ++sourceFrom) { + for (int destFrom = 256; destFrom < 320; ++destFrom) { + for (int length : + new int[] { + 0, + TestUtil.nextInt(random(), 1, Long.SIZE - 1), + TestUtil.nextInt(random(), Long.SIZE, 512) + }) { + dest.clear(); + for (int i = 0; i < dest.length(); i += 10) { + dest.set(i); + } + FixedBitSet.orRange(source, sourceFrom, dest, destFrom, length); + for (int i = 0; i < dest.length(); ++i) { + boolean destSet = i % 10 == 0; + if (i < destFrom || i >= destFrom + length) { + // Outside of the range, unmodified + assertEquals("" + i, destSet, dest.get(i)); + } else { + boolean sourceSet = source.get(sourceFrom + (i - destFrom)); + assertEquals(sourceSet || destSet, dest.get(i)); + } + } + } + } } - for (int offset = set2.length() - 128; offset < set2.length() - 64; ++offset) { - // Again, test all possible alignments, but this time we stop or-ing bits when exceeding the - // size of set2 rather than set1 - offsets.add(offset); + } + + public void testAndRange() { + FixedBitSet dest = new FixedBitSet(1_000); + FixedBitSet source = new FixedBitSet(10_000); + for (int i = 0; i < source.length(); i += 3) { + source.set(i); } - for (int offset : offsets) { - set1.clear(); - for (int i = 0; i < set1.length(); i += 10) { - set1.set(i); - } - set1.orRange(set2, offset); - int upTo = Math.min(set1.length(), set2.length() - offset); - for (int i = 0; i < set1.length(); ++i) { - if (i % 10 == 0 || i >= upTo) { - // These bits were set before, they should still be set - assertEquals(i % 10 == 0, set1.get(i)); - } else if ((offset + i) % 3 == 0) { - // These bits were set in set1, should be set in set2 - assertTrue(set1.get(i)); - } else { - assertFalse(set1.get(i)); + // Test all possible alignments, and both a "short" (less than 64) and a long length. 
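+      // In other words, the assertions below expect andRange to behave like
+      // dest[destFrom + i] &= source[sourceFrom + i] for every i in [0, length), while every bit
+      // outside [destFrom, destFrom + length) keeps its previous value (source bits are set at
+      // multiples of 3 above; dest is re-filled with bits at even indices inside the loop).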
+ for (int sourceFrom = 64; sourceFrom < 128; ++sourceFrom) { + for (int destFrom = 256; destFrom < 320; ++destFrom) { + for (int length : + new int[] { + 0, + TestUtil.nextInt(random(), 1, Long.SIZE - 1), + TestUtil.nextInt(random(), Long.SIZE, 512) + }) { + dest.clear(); + for (int i = 0; i < dest.length(); i += 2) { + dest.set(i); + } + FixedBitSet.andRange(source, sourceFrom, dest, destFrom, length); + for (int i = 0; i < dest.length(); ++i) { + boolean destSet = i % 2 == 0; + if (i < destFrom || i >= destFrom + length) { + // Outside of the range, unmodified + assertEquals("" + i, destSet, dest.get(i)); + } else { + boolean sourceSet = source.get(sourceFrom + (i - destFrom)); + assertEquals("" + i, sourceSet && destSet, dest.get(i)); + } + } } } } diff --git a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java index e87f708c8d22..b114070ba9c0 100644 --- a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java +++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java @@ -986,7 +986,7 @@ public void testPackedLongValues() { new long[RandomNumbers.randomIntBetween(random(), 1, TEST_NIGHTLY ? 1000000 : 10000)]; float[] ratioOptions = new float[] {PackedInts.DEFAULT, PackedInts.COMPACT, PackedInts.FAST}; for (int bpv : new int[] {0, 1, 63, 64, RandomNumbers.randomIntBetween(random(), 2, 62)}) { - for (DataType dataType : Arrays.asList(DataType.DELTA_PACKED)) { + for (DataType dataType : DataType.values()) { final int pageSize = 1 << TestUtil.nextInt(random(), 6, 20); float acceptableOverheadRatio = ratioOptions[TestUtil.nextInt(random(), 0, ratioOptions.length - 1)]; diff --git a/lucene/licenses/commons-LICENSE-ASL.txt b/lucene/licenses/commons-LICENSE-ASL.txt new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/lucene/licenses/commons-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/commons-NOTICE.txt b/lucene/licenses/commons-NOTICE.txt new file mode 100644 index 000000000000..554991d39bcf --- /dev/null +++ b/lucene/licenses/commons-NOTICE.txt @@ -0,0 +1,197 @@ +Apache Lucene +Copyright 2001-2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. 
See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4 +compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed +under the 2-clause BSD license. +(https://opensource.org/licenses/bsd-license.php) + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + https://snowballstem.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. 
These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See https://github.com/carrot2/carrot2. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski +(https://github.com/morfologik/morfologik-stemming) and uses +data from the BSD-licensed dictionary of Polish (SGJP, http://sgjp.pl/morfeusz/). + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. + +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. 
+ +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. + +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. 
+ +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.1.1-20180720 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz diff --git a/lucene/licenses/commons-codec-1.13.jar.sha1 b/lucene/licenses/commons-codec-1.13.jar.sha1 deleted file mode 100644 index 4d9344b4a4e6..000000000000 --- a/lucene/licenses/commons-codec-1.13.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -3f18e1aa31031d89db6f01ba05d501258ce69d2c diff --git a/lucene/licenses/commons-codec-1.17.2.jar.sha1 b/lucene/licenses/commons-codec-1.17.2.jar.sha1 new file mode 100644 index 000000000000..3ef561c0262f --- /dev/null +++ b/lucene/licenses/commons-codec-1.17.2.jar.sha1 @@ -0,0 +1 @@ +cd6bb9d856db5f61871a94d5801efd0b93b7fcb2 diff --git a/lucene/licenses/commons-lang3-3.17.0.jar.sha1 b/lucene/licenses/commons-lang3-3.17.0.jar.sha1 new file mode 100644 index 000000000000..f64174593b1c --- /dev/null +++ b/lucene/licenses/commons-lang3-3.17.0.jar.sha1 @@ -0,0 +1 @@ +b17d2136f0460dcc0d2016ceefca8723bdf4ee70 diff --git a/lucene/licenses/cuvs-java-25.02.0.jar.sha1 b/lucene/licenses/cuvs-java-25.02.0.jar.sha1 new file mode 100644 index 000000000000..ccb02e86aa8c --- /dev/null +++ b/lucene/licenses/cuvs-java-25.02.0.jar.sha1 @@ -0,0 +1 @@ +0086126edbd145e5d0be65e6157e96e3e8a2ebca diff --git a/lucene/licenses/cuvs-java-LICENSE-ASL.txt b/lucene/licenses/cuvs-java-LICENSE-ASL.txt new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/lucene/licenses/cuvs-java-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/cuvs-java-NOTICE.txt b/lucene/licenses/cuvs-java-NOTICE.txt new file mode 100644 index 000000000000..554991d39bcf --- /dev/null +++ b/lucene/licenses/cuvs-java-NOTICE.txt @@ -0,0 +1,197 @@ +Apache Lucene +Copyright 2001-2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. 
See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4 +compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed +under the 2-clause BSD license. +(https://opensource.org/licenses/bsd-license.php) + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + https://snowballstem.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. 
These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See https://github.com/carrot2/carrot2. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski +(https://github.com/morfologik/morfologik-stemming) and uses +data from the BSD-licensed dictionary of Polish (SGJP, http://sgjp.pl/morfeusz/). + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. + +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. 
+ +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. + +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. 
+ +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.1.1-20180720 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz diff --git a/lucene/licenses/opennlp-tools-2.3.2.jar.sha1 b/lucene/licenses/opennlp-tools-2.3.2.jar.sha1 deleted file mode 100644 index 94b2924f8fa7..000000000000 --- a/lucene/licenses/opennlp-tools-2.3.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -d739edba1e729691ed5ab80e1ccf330555a02ea7 diff --git a/lucene/licenses/opennlp-tools-2.5.3.jar.sha1 b/lucene/licenses/opennlp-tools-2.5.3.jar.sha1 new file mode 100644 index 000000000000..fb01299fa29d --- /dev/null +++ b/lucene/licenses/opennlp-tools-2.5.3.jar.sha1 @@ -0,0 +1 @@ +4b544138ec079c1c73dc2c1b928506871c4b1b47 diff --git a/lucene/licenses/slf4j-api-1.7.36.jar.sha1 b/lucene/licenses/slf4j-api-1.7.36.jar.sha1 deleted file mode 100644 index 828b7cf7e056..000000000000 --- a/lucene/licenses/slf4j-api-1.7.36.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -6c62681a2f655b49963a5983b8b0950a6120ae14 diff --git a/lucene/licenses/slf4j-api-2.0.16.jar.sha1 b/lucene/licenses/slf4j-api-2.0.16.jar.sha1 new file mode 100644 index 000000000000..b1bb75be39b1 --- /dev/null +++ b/lucene/licenses/slf4j-api-2.0.16.jar.sha1 @@ -0,0 +1 @@ +0172931663a09a1fa515567af5fbef00897d3c04 diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/BpVectorReorderer.java b/lucene/misc/src/java/org/apache/lucene/misc/index/BpVectorReorderer.java index 7facf48580c8..246109ede04c 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/index/BpVectorReorderer.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/index/BpVectorReorderer.java @@ -311,10 +311,8 @@ private int shuffle( depth, vectorScore) .compute(); - - float scale = - VectorUtil.dotProduct(leftCentroid, leftCentroid) - + VectorUtil.dotProduct(rightCentroid, rightCentroid); + vectorSubtract(leftCentroid, rightCentroid, scratch); + float scale = (float) Math.sqrt(VectorUtil.dotProduct(scratch, scratch)); float maxLeftBias = Float.NEGATIVE_INFINITY; for (int i = ids.offset; i < midPoint; ++i) { maxLeftBias = Math.max(maxLeftBias, biases[i]); diff --git a/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java b/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java index ff7ea2341acd..b56a206e60b1 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.misc.store; +import static java.nio.ByteOrder.LITTLE_ENDIAN; + import java.io.EOFException; import java.io.IOException; import java.io.UncheckedIOException; @@ -26,6 +28,7 @@ import java.nio.file.Path; import java.nio.file.StandardOpenOption; import java.util.Arrays; +import java.util.Objects; import java.util.OptionalLong; import java.util.zip.CRC32; import java.util.zip.Checksum; @@ -146,7 +149,7 @@ protected void ensureOpen() throws AlreadyClosedException { /** * Determines if direct IO should be used for a file. By default this tests if it is a merge - * context and if the merge or file length extends the minimum size (see {@link + * context and if the merge or file length exceeds the minimum size (see {@link * #DEFAULT_MIN_BYTES_DIRECT}). 
Subclasses may override method to enforce direct IO for specific * file types. * * @@ -211,8 +214,8 @@ private static final class DirectIOIndexOutput extends IndexOutput { * bypassing OS buffer * * @throws UnsupportedOperationException if the JDK does not support Direct I/O - * @throws IOException if the operating system or filesystem does not support support Direct I/O - * or a sufficient equivalent. + * @throws IOException if the operating system or filesystem does not support Direct I/O or a + * sufficient equivalent. */ public DirectIOIndexOutput(Path path, String name, int blockSize, int bufferSize) throws IOException { @@ -296,9 +299,10 @@ private static final class DirectIOIndexInput extends IndexInput { private final ByteBuffer buffer; private final FileChannel channel; private final int blockSize; - + private final long offset; + private final long length; + private final boolean isClosable; // clones and slices are not closable private boolean isOpen; - private boolean isClone; private long filePos; /** @@ -311,49 +315,57 @@ private static final class DirectIOIndexInput extends IndexInput { */ public DirectIOIndexInput(Path path, int blockSize, int bufferSize) throws IOException { super("DirectIOIndexInput(path=\"" + path + "\")"); - this.blockSize = blockSize; - this.channel = FileChannel.open(path, StandardOpenOption.READ, getDirectOpenOption()); - this.buffer = ByteBuffer.allocateDirect(bufferSize + blockSize - 1).alignedSlice(blockSize); - - isOpen = true; - isClone = false; - filePos = -bufferSize; - buffer.limit(0); + this.blockSize = blockSize; + this.channel = FileChannel.open(path, StandardOpenOption.READ, getDirectOpenOption()); + this.buffer = allocateBuffer(bufferSize, blockSize); + this.isOpen = true; + this.isClosable = true; + this.length = channel.size(); + this.offset = 0L; + this.filePos = -bufferSize; + this.buffer.limit(0); } - // for clone - private DirectIOIndexInput(DirectIOIndexInput other) throws IOException { - super(other.toString()); - this.channel = other.channel; - this.blockSize = other.blockSize; - + // for clone/slice + private DirectIOIndexInput( + String description, DirectIOIndexInput other, long offset, long length) throws IOException { + super(description); + Objects.checkFromIndexSize(offset, length, other.channel.size()); final int bufferSize = other.buffer.capacity(); - this.buffer = ByteBuffer.allocateDirect(bufferSize + blockSize - 1).alignedSlice(blockSize); - - isOpen = true; - isClone = true; - filePos = -bufferSize; + this.buffer = allocateBuffer(bufferSize, other.blockSize); + this.blockSize = other.blockSize; + this.channel = other.channel; + this.isOpen = true; + this.isClosable = false; + this.length = length; + this.offset = offset; + this.filePos = -bufferSize; buffer.limit(0); - seek(other.getFilePointer()); + } + + private static ByteBuffer allocateBuffer(int bufferSize, int blockSize) { + return ByteBuffer.allocateDirect(bufferSize + blockSize - 1) + .alignedSlice(blockSize) + .order(LITTLE_ENDIAN); } @Override public void close() throws IOException { - if (isOpen && !isClone) { + if (isOpen && isClosable) { channel.close(); + isOpen = false; } } @Override public long getFilePointer() { - long filePointer = filePos + buffer.position(); + long filePointer = filePos + buffer.position() - offset; // opening the input and immediately calling getFilePointer without calling readX (and thus // refill) first, // will result in negative value equal to bufferSize being returned, // due to the initialization method filePos = -bufferSize used in constructor.
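+      // For example, a freshly opened (non-sliced) input has offset == 0, an empty buffer and
+      // filePos == -bufferSize, so the raw value computed above is negative until the first
+      // refill(); the Math.max below is what makes callers observe position 0 in that state.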
- assert filePointer == -buffer.capacity() || filePointer >= 0 + assert filePointer == -buffer.capacity() - offset || filePointer >= 0 : "filePointer should either be initial value equal to negative buffer capacity, or larger than or equal to 0"; return Math.max(filePointer, 0); } @@ -361,23 +373,24 @@ public long getFilePointer() { @Override public void seek(long pos) throws IOException { if (pos != getFilePointer()) { - final long alignedPos = pos - (pos % blockSize); - filePos = alignedPos - buffer.capacity(); - - final int delta = (int) (pos - alignedPos); - refill(delta); - buffer.position(delta); + seekInternal(pos); } assert pos == getFilePointer(); } + private void seekInternal(long pos) throws IOException { + final long absPos = pos + offset; + final long alignedPos = absPos - (absPos % blockSize); + filePos = alignedPos - buffer.capacity(); + + final int delta = (int) (absPos - alignedPos); + refill(delta); + buffer.position(delta); + } + @Override public long length() { - try { - return channel.size(); - } catch (IOException ioe) { - throw new UncheckedIOException(ioe); - } + return length; } @Override @@ -389,12 +402,39 @@ public byte readByte() throws IOException { return buffer.get(); } + @Override + public short readShort() throws IOException { + if (buffer.remaining() >= Short.BYTES) { + return buffer.getShort(); + } else { + return super.readShort(); + } + } + + @Override + public int readInt() throws IOException { + if (buffer.remaining() >= Integer.BYTES) { + return buffer.getInt(); + } else { + return super.readInt(); + } + } + + @Override + public long readLong() throws IOException { + if (buffer.remaining() >= Long.BYTES) { + return buffer.getLong(); + } else { + return super.readLong(); + } + } + private void refill(int bytesToRead) throws IOException { filePos += buffer.capacity(); // BaseDirectoryTestCase#testSeekPastEOF test for consecutive read past EOF, // hence throwing EOFException early to maintain buffer state (position in particular) - if (filePos > channel.size() || (channel.size() - filePos < bytesToRead)) { + if (filePos > offset + length || ((offset + length) - filePos < bytesToRead)) { throw new EOFException("read past EOF: " + this); } @@ -428,19 +468,83 @@ public void readBytes(byte[] dst, int offset, int len) throws IOException { } } + @Override + public void readInts(int[] dst, int offset, int len) throws IOException { + int remainingDst = len; + while (remainingDst > 0) { + int cnt = Math.min(buffer.remaining() / Integer.BYTES, remainingDst); + buffer.asIntBuffer().get(dst, offset + len - remainingDst, cnt); + buffer.position(buffer.position() + Integer.BYTES * cnt); + remainingDst -= cnt; + if (remainingDst > 0) { + if (buffer.hasRemaining()) { + dst[offset + len - remainingDst] = readInt(); + --remainingDst; + } else { + refill(remainingDst * Integer.BYTES); + } + } + } + } + + @Override + public void readFloats(float[] dst, int offset, int len) throws IOException { + int remainingDst = len; + while (remainingDst > 0) { + int cnt = Math.min(buffer.remaining() / Float.BYTES, remainingDst); + buffer.asFloatBuffer().get(dst, offset + len - remainingDst, cnt); + buffer.position(buffer.position() + Float.BYTES * cnt); + remainingDst -= cnt; + if (remainingDst > 0) { + if (buffer.hasRemaining()) { + dst[offset + len - remainingDst] = Float.intBitsToFloat(readInt()); + --remainingDst; + } else { + refill(remainingDst * Float.BYTES); + } + } + } + } + + @Override + public void readLongs(long[] dst, int offset, int len) throws IOException { + int 
remainingDst = len; + while (remainingDst > 0) { + int cnt = Math.min(buffer.remaining() / Long.BYTES, remainingDst); + buffer.asLongBuffer().get(dst, offset + len - remainingDst, cnt); + buffer.position(buffer.position() + Long.BYTES * cnt); + remainingDst -= cnt; + if (remainingDst > 0) { + if (buffer.hasRemaining()) { + dst[offset + len - remainingDst] = readLong(); + --remainingDst; + } else { + refill(remainingDst * Long.BYTES); + } + } + } + } + @Override public DirectIOIndexInput clone() { try { - return new DirectIOIndexInput(this); + var clone = new DirectIOIndexInput("clone:" + this, this, offset, length); + clone.seekInternal(getFilePointer()); + return clone; } catch (IOException ioe) { throw new UncheckedIOException(ioe); } } @Override - public IndexInput slice(String sliceDescription, long offset, long length) { - // TODO: is this the right thing to do? - return BufferedIndexInput.wrap(sliceDescription, this, offset, length); + public IndexInput slice(String sliceDescription, long offset, long length) throws IOException { + if ((length | offset) < 0 || length > this.length - offset) { + throw new IllegalArgumentException( + "slice() " + sliceDescription + " out of bounds: " + this); + } + var slice = new DirectIOIndexInput(sliceDescription, this, this.offset + offset, length); + slice.seekInternal(0L); + return slice; } } } diff --git a/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java b/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java index 420d6d40d6de..cd90db9abbe7 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java @@ -136,7 +136,7 @@ public RAFIndexInput clone() { @Override public IndexInput slice(String sliceDescription, long offset, long length) throws IOException { - if (offset < 0 || length < 0 || offset + length > this.length()) { + if ((length | offset) < 0 || length > this.length() - offset) { throw new IllegalArgumentException( "slice() " + sliceDescription + " out of bounds: " + this); } diff --git a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java index 7441e68f7d48..3b484ed2430a 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java @@ -62,7 +62,7 @@ public void setUp() throws Exception { } private void createQuantizedIndex(Directory dir, List vectors) throws IOException { - IndexWriterConfig cfg = newIndexWriterConfig(); + IndexWriterConfig cfg = new IndexWriterConfig(); cfg.setCodec( new Lucene101Codec() { @Override @@ -318,10 +318,7 @@ && angularDifference(t0min, t0max) < angularDifference(t0min, t1max)) public void testIndexReorderDense() throws Exception { List vectors = shuffleVectors(randomLinearVectors()); - // compute the expected ordering - Sorter.DocMap expected = - reorderer.computeValueMap( - FloatVectorValues.fromFloats(vectors, 2), VectorSimilarityFunction.EUCLIDEAN, null); + Path tmpdir = createTempDir(); try (Directory dir = newFSDirectory(tmpdir)) { // create an index with a single leaf @@ -335,6 +332,28 @@ public void testIndexReorderDense() throws Exception { } writer.forceMerge(1); } + + // The docId of the documents might have changed due to merging. Compute a mapping from + // the stored id to the current docId and repopulate the vector list. 
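Both slice() bounds checks above (DirectIOIndexInput and RAFIndexInput) compare length against the input length minus offset instead of computing offset + length, which can overflow for a hostile or buggy request. A small sketch with hypothetical values showing the difference:

    // Sketch only: why the rewritten bounds check rejects an overflowing slice request.
    public class SliceBoundsSketch {
      public static void main(String[] args) {
        long fileLength = 1_000L;          // hypothetical input length
        long offset = 10L;
        long length = Long.MAX_VALUE - 5;  // absurd request; offset + length overflows

        boolean oldRejects = offset < 0 || length < 0 || offset + length > fileLength;  // false: overflow slips through
        boolean newRejects = (length | offset) < 0 || length > fileLength - offset;     // true: rejected

        System.out.println("old rejects: " + oldRejects + ", new rejects: " + newRejects);
      }
    }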
+ int[] storedIdToDocId = new int[vectors.size()]; + vectors.clear(); + try (IndexReader reader = DirectoryReader.open(dir)) { + LeafReader leafReader = getOnlyLeafReader(reader); + FloatVectorValues values = leafReader.getFloatVectorValues("f"); + StoredFields storedFields = reader.storedFields(); + KnnVectorValues.DocIndexIterator it = values.iterator(); + while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + int storedId = Integer.parseInt(storedFields.document(it.docID()).get("id")); + vectors.add(values.vectorValue(it.index()).clone()); + storedIdToDocId[storedId] = it.docID(); + } + } + + // compute the expected ordering + Sorter.DocMap expected = + reorderer.computeValueMap( + FloatVectorValues.fromFloats(vectors, 2), VectorSimilarityFunction.EUCLIDEAN, null); + int threadCount = random().nextInt(4) + 1; threadCount = 1; // reorder using the index reordering tool @@ -355,12 +374,13 @@ public void testIndexReorderDense() throws Exception { StoredFields storedFields = reader.storedFields(); KnnVectorValues.DocIndexIterator it = values.iterator(); while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - int storedId = Integer.parseInt(storedFields.document(it.docID()).get("id")); - assertEquals(expected.oldToNew(storedId), newId); + int oldDocId = + storedIdToDocId[Integer.parseInt(storedFields.document(it.docID()).get("id"))]; + assertEquals(expected.oldToNew(oldDocId), newId); float[] expectedVector = vectors.get(expected.newToOld(it.docID())); float[] actualVector = values.vectorValue(it.index()); assertArrayEquals( - "values differ at index " + storedId + "->" + newId + " docid=" + it.docID(), + "values differ at index " + oldDocId + "->" + newId + " docid=" + it.docID(), expectedVector, actualVector, 0); @@ -380,7 +400,7 @@ public void testIndexReorderSparse() throws Exception { int maxDoc = 0; try (Directory dir = newFSDirectory(tmpdir)) { // create an index with a single leaf - try (IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig())) { + try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) { for (float[] vector : vectors) { Document doc = new Document(); if (random().nextBoolean()) { @@ -394,7 +414,6 @@ public void testIndexReorderSparse() throws Exception { writer.addDocument(doc); maxDoc++; } - writer.forceMerge(1); } // reorder using the index reordering tool BpVectorReorderer.main( diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/package-info.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/package-info.java index 4d20996e41bd..f3c1ded1f1ff 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/package-info.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/package-info.java @@ -15,83 +15,5 @@ * limitations under the License. */ -/** - * Flexible query parser is a modular, extensible framework for implementing Lucene query parsers. - * In the flexible query parser model, query parsing takes three steps: syntax parsing, processing - * (query semantics) and building (conversion to a Lucene {@link org.apache.lucene.search.Query}). - * - *

    The flexible query parser module provides not just the framework but also the {@linkplain - * org.apache.lucene.queryparser.flexible.standard.StandardQueryParser} - the default implementation - * of a fully fledged query parser that supports most of the classic query parser's syntax but also - * adds support for interval functions, min-should-match operator on Boolean groups and many hooks - * for customization of how the parser behaves at runtime. - * - *

    The flexible query parser is divided in two packages: - * - *

      - *
    • {@link org.apache.lucene.queryparser.flexible.core}: contains the query parser API classes, - * which should be extended by custom query parser implementations. - *
    • {@link org.apache.lucene.queryparser.flexible.standard}: contains an example Lucene query - * parser implementation built on top of the flexible query parser API. - *
    - * - *

    Features

    - * - *
      - *
    1. full support for Boolean expressions, including groups - *
    2. {@linkplain org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser syntax parsers} - * - support for arbitrary syntax parsers, that can be converted into {@link - * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} trees. - *
    3. {@linkplain org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessor query - * node processors} - optimize, validate, rewrite the {@link - * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} trees - *
    4. {@linkplain - * org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessorPipeline processor - * pipelines} - select your favorite query processors and build a pipeline to implement the - * features you need. - *
    5. {@linkplain org.apache.lucene.queryparser.flexible.core.config.QueryConfigHandler query - * configuration handlers} - *
    6. {@linkplain org.apache.lucene.queryparser.flexible.core.builders.QueryBuilder query - * builders} - convert {@link org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} - * trees into Lucene {@link org.apache.lucene.search.Query} instances. - *
    - * - *

    Design

    - * - *

    The flexible query parser was designed to have a very generic architecture, so that it can be - * easily used for different products with varying query syntax needs. - * - *

    The query parser has three layers and its core is what we call the {@linkplain - * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode query node tree}. It is a tree of - * objects that represent the syntax of the original query, for example, for 'a AND b' the tree - * could look like this: - * - *

    - *       AND
    - *      /   \
    - *     A     B
    - * 
    - * - *

    The three flexible query parser layers are: - * - *

    - *
    {@link org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser} - *
    This layer is the text parsing layer which simply transforms the query text string into a - * {@link org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} tree. Every text parser - * must implement the interface {@link - * org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser}. The default - * implementation is {@link - * org.apache.lucene.queryparser.flexible.standard.parser.StandardSyntaxParser}. - *
    {@link org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessor} - *
    The query node processor does most of the work: it contains a chain of {@linkplain - * org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessor query node - * processors}. Each processor can walk the tree and modify nodes or even the tree's - * structure. This allows for query optimization before the node tree is converted to an - * actual query. - *
    {@link org.apache.lucene.queryparser.flexible.core.builders.QueryBuilder} - *
    The third layer is a configurable map of builders, which map {@linkplain - * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode query nodes} to their adapters - * that convert each node into a {@link org.apache.lucene.search.Query}. - *
    - */ +/** */ package org.apache.lucene.queryparser.flexible; diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/package-info.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/package-info.java index 569df7a029cf..9d02e8aff020 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/package-info.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/package-info.java @@ -22,10 +22,83 @@ * operations. In the new query parser structure, the parsing was divided in 3 steps: parsing * (syntax), processing (semantic) and building. * - *

    The classes contained in the package org.apache.lucene.queryParser.standard are used to - * reproduce the same behavior as the old query parser. + *

    Flexible query parser is a modular, extensible framework for implementing Lucene query + * parsers. In the flexible query parser model, query parsing takes three steps: syntax parsing, + * processing (query semantics) and building (conversion to a Lucene {@link + * org.apache.lucene.search.Query}). * - *

    Check {@link org.apache.lucene.queryparser.flexible.standard.StandardQueryParser} to quick - * start using the Lucene query parser. + *

    The flexible query parser module provides not just the framework but also the {@linkplain + * org.apache.lucene.queryparser.flexible.standard.StandardQueryParser} - the default implementation + * of a fully fledged query parser that supports most of the classic query parser's syntax but also + * adds support for interval functions, min-should-match operator on Boolean groups and many hooks + * for customization of how the parser behaves at runtime. + * + *

The flexible query parser is divided into two packages: + * + *

      + *
    • {@link org.apache.lucene.queryparser.flexible.core}: contains the query parser API classes, + * which should be extended by custom query parser implementations. + *
    • {@link org.apache.lucene.queryparser.flexible.standard}: contains an example Lucene query + * parser implementation built on top of the flexible query parser API. + *
    + * + *

    Features

    + * + *
      + *
    1. full support for Boolean expressions, including groups + *
2. {@linkplain org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser syntax parsers} + * - support for arbitrary syntax parsers that can be converted into {@link + * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} trees. + *
    3. {@linkplain org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessor query + * node processors} - optimize, validate, rewrite the {@link + * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} trees + *
    4. {@linkplain + * org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessorPipeline processor + * pipelines} - select your favorite query processors and build a pipeline to implement the + * features you need. + *
    5. {@linkplain org.apache.lucene.queryparser.flexible.core.config.QueryConfigHandler query + * configuration handlers} + *
    6. {@linkplain org.apache.lucene.queryparser.flexible.core.builders.QueryBuilder query + * builders} - convert {@link org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} + * trees into Lucene {@link org.apache.lucene.search.Query} instances. + *
    + * + *

    Design

    + * + *

    The flexible query parser was designed to have a very generic architecture, so that it can be + * easily used for different products with varying query syntax needs. + * + *

    The query parser has three layers and its core is what we call the {@linkplain + * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode query node tree}. It is a tree of + * objects that represent the syntax of the original query, for example, for 'a AND b' the tree + * could look like this: + * + *

    + *       AND
    + *      /   \
    + *     A     B
    + * 
    + * + *

    The three flexible query parser layers are: + * + *

    + *
    {@link org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser} + *
    This layer is the text parsing layer which simply transforms the query text string into a + * {@link org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} tree. Every text parser + * must implement the interface {@link + * org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser}. The default + * implementation is {@link + * org.apache.lucene.queryparser.flexible.standard.parser.StandardSyntaxParser}. + *
    {@link org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessor} + *
    The query node processor does most of the work: it contains a chain of {@linkplain + * org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessor query node + * processors}. Each processor can walk the tree and modify nodes or even the tree's + * structure. This allows for query optimization before the node tree is converted to an + * actual query. + *
    {@link org.apache.lucene.queryparser.flexible.core.builders.QueryBuilder} + *
    The third layer is a configurable map of builders, which map {@linkplain + * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode query nodes} to their adapters + * that convert each node into a {@link org.apache.lucene.search.Query}. + *
    */ package org.apache.lucene.queryparser.flexible.standard; diff --git a/lucene/queryparser/src/java/overview.html b/lucene/queryparser/src/java/overview.html index 2b6f8a446afb..a7c579dd836f 100644 --- a/lucene/queryparser/src/java/overview.html +++ b/lucene/queryparser/src/java/overview.html @@ -27,16 +27,16 @@

    Apache Lucene QueryParsers.

    This module provides a number of query parsers:

- • {@linkplain org.apache.lucene.queryparser.flexible flexible query parser}
+ • {@linkplain org.apache.lucene.queryparser.flexible.standard flexible query parser}
  • {@linkplain org.apache.lucene.queryparser.classic classic query parser}
  • {@linkplain org.apache.lucene.queryparser.complexPhrase complex phrase query parser}
  • {@linkplain org.apache.lucene.queryparser.ext extendable query parser}
- • {@linkplain org.apache.lucene.queryparser.surround surround query parser (span queries)}
+ • {@linkplain org.apache.lucene.queryparser.surround.parser surround query parser (span queries)}
  • {@linkplain org.apache.lucene.queryparser.xml query parser building Query objects from XML}

    - If you're new to query parsers, the {@linkplain org.apache.lucene.queryparser.flexible flexible query parser}'s + If you're new to query parsers, the {@linkplain org.apache.lucene.queryparser.flexible.standard flexible query parser}'s {@link org.apache.lucene.queryparser.flexible.standard.StandardQueryParser} is probably a good place to start. diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java index 8c1f5fd71e3a..c05f5e028a08 100644 --- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java +++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java @@ -164,7 +164,6 @@ private static IndexWriter initWriter( if (mp instanceof TieredMergePolicy) { TieredMergePolicy tmp = (TieredMergePolicy) mp; tmp.setSegmentsPerTier(3); - tmp.setMaxMergeAtOnce(3); } else if (mp instanceof LogMergePolicy) { LogMergePolicy lmp = (LogMergePolicy) mp; lmp.setMergeFactor(3); diff --git a/lucene/sandbox/build.gradle b/lucene/sandbox/build.gradle index 72762fe1c3d2..6d225fd78ba4 100644 --- a/lucene/sandbox/build.gradle +++ b/lucene/sandbox/build.gradle @@ -19,9 +19,16 @@ apply plugin: 'java-library' description = 'Various third party contributions and new ideas' +repositories { + mavenLocal() +} + + dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:queries') moduleApi project(':lucene:facet') moduleTestImplementation project(':lucene:test-framework') + moduleImplementation deps.commons.lang3 + moduleImplementation deps.cuvs } diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index f40a05af433a..59e89cfd0bf0 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -20,6 +20,8 @@ requires org.apache.lucene.core; requires org.apache.lucene.queries; requires org.apache.lucene.facet; + requires java.logging; + requires com.nvidia.cuvs; exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; @@ -34,7 +36,12 @@ exports org.apache.lucene.sandbox.facet.iterators; exports org.apache.lucene.sandbox.facet.cutters; exports org.apache.lucene.sandbox.facet.labels; + exports org.apache.lucene.sandbox.vectorsearch; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; + provides org.apache.lucene.codecs.KnnVectorsFormat with + org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat; + provides com.nvidia.cuvs.spi.CuVSServiceProvider with + org.apache.lucene.sandbox.vectorsearch.FilterCuVSServiceProvider; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringScorer.java index dfedb51ed1f4..09e2bb57af7a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringScorer.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringScorer.java @@ -51,7 +51,7 @@ final class CoveringScorer extends Scorer { this.minMatchValues = minMatchValues; this.doc = -1; - subScorers = new DisiPriorityQueue(scorers.size()); + subScorers = DisiPriorityQueue.ofMaxSize(scorers.size()); for (Scorer scorer : scorers) { subScorers.add(new DisiWrapper(scorer, false)); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java new file mode 100644 index 000000000000..c3ddc809c4d3 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.LibraryException; +import java.util.logging.Logger; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.IndexType; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; + +/** CuVS based codec for GPU based vector search */ +public class CuVSCodec extends FilterCodec { + + public CuVSCodec() { + this("CuVSCodec", new Lucene101Codec()); + } + + public CuVSCodec(String name, Codec delegate) { + super(name, delegate); + KnnVectorsFormat format; + try { + format = new CuVSVectorsFormat(1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE, IndexType.CAGRA); + setKnnFormat(format); + } catch (LibraryException ex) { + Logger log = Logger.getLogger(CuVSCodec.class.getName()); + log.severe("Couldn't load native library, possible classloader issue. " + ex.getMessage()); + } + } + + KnnVectorsFormat knnFormat = null; + + @Override + public KnnVectorsFormat knnVectorsFormat() { + return knnFormat; + } + + public void setKnnFormat(KnnVectorsFormat format) { + this.knnFormat = format; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java new file mode 100644 index 000000000000..61b8f0879202 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
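CuVSCodec above is a FilterCodec around Lucene101Codec that swaps in the CuVS KnnVectorsFormat when the native library loads. A hypothetical usage sketch (index path, field name and vector values are made up; a real run needs the cuvs native library to be available):

    // Hypothetical sketch, not part of the patch.
    import java.nio.file.Path;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.KnnFloatVectorField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.sandbox.vectorsearch.CuVSCodec;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class CuVSCodecUsageSketch {
      public static void main(String[] args) throws Exception {
        IndexWriterConfig config = new IndexWriterConfig();
        config.setCodec(new CuVSCodec());
        try (Directory dir = FSDirectory.open(Path.of("/tmp/cuvs-demo"));
             IndexWriter writer = new IndexWriter(dir, config)) {
          Document doc = new Document();
          doc.add(new KnnFloatVectorField("vector", new float[] {0.1f, 0.2f, 0.3f, 0.4f}));
          writer.addDocument(doc);
          writer.commit();
        }
      }
    }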
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.util.List; +import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter; +import org.apache.lucene.index.DocsWithFieldSet; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.util.RamUsageEstimator; + +/** CuVS based fields writer */ +/*package-private*/ class CuVSFieldWriter extends KnnFieldVectorsWriter { + + private static final long SHALLOW_SIZE = + RamUsageEstimator.shallowSizeOfInstance(CuVSFieldWriter.class); + + private final FieldInfo fieldInfo; + private final FlatFieldVectorsWriter flatFieldVectorsWriter; + private int lastDocID = -1; + + public CuVSFieldWriter( + FieldInfo fieldInfo, FlatFieldVectorsWriter flatFieldVectorsWriter) { + this.fieldInfo = fieldInfo; + this.flatFieldVectorsWriter = flatFieldVectorsWriter; + } + + @Override + public void addValue(int docID, float[] vectorValue) throws IOException { + if (docID == lastDocID) { + throw new IllegalArgumentException( + "VectorValuesField \"" + + fieldInfo.name + + "\" appears more than once in this document (only one value is allowed per field)"); + } + flatFieldVectorsWriter.addValue(docID, vectorValue); + } + + List getVectors() { + return flatFieldVectorsWriter.getVectors(); + } + + FieldInfo fieldInfo() { + return fieldInfo; + } + + DocsWithFieldSet getDocsWithFieldSet() { + return flatFieldVectorsWriter.getDocsWithFieldSet(); + } + + @Override + public float[] copyValue(float[] vectorValue) { + throw new UnsupportedOperationException(); + } + + @Override + public long ramBytesUsed() { + return SHALLOW_SIZE + flatFieldVectorsWriter.ramBytesUsed(); + } + + @Override + public String toString() { + return "CuVSFieldWriter[field name=" + fieldInfo.name + ", number=" + fieldInfo.number + "]"; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java new file mode 100644 index 000000000000..d0cfe86d708e --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsReader.handleThrowable; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.HnswIndex; +import java.io.Closeable; +import java.io.IOException; +import java.util.Objects; + +/** This class holds references to the actual CuVS Index (Cagra, Brute force, etc.) 
*/ +public class CuVSIndex implements Closeable { + private final CagraIndex cagraIndex; + private final BruteForceIndex bruteforceIndex; + private final HnswIndex hnswIndex; + + private int maxDocs; + private String fieldName; + private String segmentName; + private volatile boolean closed; + + public CuVSIndex( + String segmentName, + String fieldName, + CagraIndex cagraIndex, + int maxDocs, + BruteForceIndex bruteforceIndex) { + this.cagraIndex = Objects.requireNonNull(cagraIndex); + this.bruteforceIndex = Objects.requireNonNull(bruteforceIndex); + this.fieldName = Objects.requireNonNull(fieldName); + this.segmentName = Objects.requireNonNull(segmentName); + if (maxDocs < 0) { + throw new IllegalArgumentException("negative maxDocs:" + maxDocs); + } + this.maxDocs = maxDocs; + this.hnswIndex = null; // TODO: + } + + public CuVSIndex(CagraIndex cagraIndex, BruteForceIndex bruteforceIndex, HnswIndex hnswIndex) { + this.cagraIndex = cagraIndex; + this.bruteforceIndex = bruteforceIndex; + this.hnswIndex = hnswIndex; + } + + public CagraIndex getCagraIndex() { + ensureOpen(); + return cagraIndex; + } + + public BruteForceIndex getBruteforceIndex() { + ensureOpen(); + return bruteforceIndex; + } + + public HnswIndex getHNSWIndex() { + ensureOpen(); + return hnswIndex; + } + + public String getFieldName() { + return fieldName; + } + + public String getSegmentName() { + return segmentName; + } + + public int getMaxDocs() { + return maxDocs; + } + + private void ensureOpen() { + if (closed) { + throw new IllegalStateException("index is closed"); + } + } + + @Override + public void close() throws IOException { + if (closed) { + return; + } + closed = true; + destroyIndices(); + } + + private void destroyIndices() throws IOException { + try { + if (cagraIndex != null) { + cagraIndex.destroyIndex(); + } + if (bruteforceIndex != null) { + bruteforceIndex.destroyIndex(); + } + if (hnswIndex != null) { + hnswIndex.destroyIndex(); + } + } catch (Throwable t) { + handleThrowable(t); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java new file mode 100644 index 000000000000..2f6c636590ef --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.knn.KnnCollectorManager; +import org.apache.lucene.util.Bits; + +/** Query for CuVS */ +public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { + + private final int iTopK; + private final int searchWidth; + + public CuVSKnnFloatVectorQuery(String field, float[] target, int k, int iTopK, int searchWidth) { + super(field, target, k); + this.iTopK = iTopK; + this.searchWidth = searchWidth; + } + + @Override + protected TopDocs approximateSearch( + LeafReaderContext context, + Bits acceptDocs, + int visitedLimit, + KnnCollectorManager knnCollectorManager) + throws IOException { + + PerLeafCuVSKnnCollector results = new PerLeafCuVSKnnCollector(k, iTopK, searchWidth); + + LeafReader reader = context.reader(); + reader.searchNearestVectors(field, this.getTargetCopy(), results, null); + return results.topDocs(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java new file mode 100644 index 000000000000..9b12cdf61012 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
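CuVSKnnFloatVectorQuery above reuses KnnFloatVectorQuery but routes approximateSearch through a PerLeafCuVSKnnCollector with the extra iTopK and searchWidth knobs. A hypothetical search sketch over an index written with the CuVS format (field name and parameter values are made up):

    // Hypothetical sketch, not part of the patch.
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.sandbox.vectorsearch.CuVSKnnFloatVectorQuery;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;

    public class CuVSQuerySketch {
      // k = 10 nearest neighbours, iTopK = 32 and searchWidth = 1 are made-up values.
      static TopDocs searchNearest(Directory dir, float[] queryVector) throws Exception {
        try (IndexReader reader = DirectoryReader.open(dir)) {
          IndexSearcher searcher = new IndexSearcher(reader);
          CuVSKnnFloatVectorQuery query =
              new CuVSKnnFloatVectorQuery("vector", queryVector, 10, 32, 1);
          return searcher.search(query, 10);
        }
      }
    }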
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.logging.Logger; +import java.util.zip.Deflater; +import java.util.zip.ZipEntry; +import java.util.zip.ZipOutputStream; + +/** Methods to deal with a CuVS composite file inside a segment */ +/*package-private*/ class CuVSSegmentFile implements AutoCloseable { + private final ZipOutputStream zos; + + private Set filesAdded = new HashSet(); + + public CuVSSegmentFile(OutputStream out) { + zos = new ZipOutputStream(out); + zos.setLevel(Deflater.NO_COMPRESSION); + } + + protected Logger log = Logger.getLogger(getClass().getName()); + + public void addFile(String name, byte[] bytes) throws IOException { + /*log.info( + "Writing the file: " + + name + + ", size=" + + bytes.length);*/ + ZipEntry indexFileZipEntry = new ZipEntry(name); + zos.putNextEntry(indexFileZipEntry); + zos.write(bytes, 0, bytes.length); + zos.closeEntry(); + filesAdded.add(name); + } + + public Set getFilesAdded() { + return Collections.unmodifiableSet(filesAdded); + } + + @Override + public void close() throws IOException { + zos.close(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java new file mode 100644 index 000000000000..e0d4678aa5fe --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
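The CuVSVectorsFormat defined next can also be selected per field rather than through CuVSCodec, using the same Lucene101Codec subclassing pattern that TestBpVectorReorderer uses earlier in this change. A hypothetical wiring sketch (the field name is made up; supported() guards against the native library failing to load):

    // Hypothetical per-field wiring sketch, not part of the patch.
    import org.apache.lucene.codecs.KnnVectorsFormat;
    import org.apache.lucene.codecs.lucene101.Lucene101Codec;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat;

    public class PerFieldCuVSSketch {
      static IndexWriterConfig newConfig() {
        IndexWriterConfig cfg = new IndexWriterConfig();
        cfg.setCodec(
            new Lucene101Codec() {
              @Override
              public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
                // Only the (hypothetical) "vector" field goes to the GPU-backed format.
                if ("vector".equals(field) && CuVSVectorsFormat.supported()) {
                  return new CuVSVectorsFormat();
                }
                return super.getKnnVectorsFormatForField(field);
              }
            });
        return cfg;
      }
    }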
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.CuVSResources; +import com.nvidia.cuvs.LibraryException; +import java.io.IOException; +import java.util.logging.Logger; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; +import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.IndexType; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; + +/** CuVS based KnnVectorsFormat for GPU acceleration */ +public class CuVSVectorsFormat extends KnnVectorsFormat { + + private static final Logger LOG = Logger.getLogger(CuVSVectorsFormat.class.getName()); + + // TODO: fix Lucene version in name, to the final targeted release, if any + static final String CUVS_META_CODEC_NAME = "Lucene102CuVSVectorsFormatMeta"; + static final String CUVS_META_CODEC_EXT = "vemc"; // ""cagmf"; + static final String CUVS_INDEX_CODEC_NAME = "Lucene102CuVSVectorsFormatIndex"; + static final String CUVS_INDEX_EXT = "vcag"; + + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + public static final int DEFAULT_WRITER_THREADS = 32; + public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; + public static final int DEFAULT_GRAPH_DEGREE = 64; + public static final MergeStrategy DEFAULT_MERGE_STRATEGY = MergeStrategy.NON_TRIVIAL_MERGE; + public static final IndexType DEFAULT_INDEX_TYPE = IndexType.CAGRA; + + static CuVSResources resources = cuVSResourcesOrNull(); + + /** The format for storing, reading, and merging raw vectors on disk. */ + private static final FlatVectorsFormat flatVectorsFormat = + new Lucene99FlatVectorsFormat(DefaultFlatVectorScorer.INSTANCE); + + final int maxDimensions = 4096; + final int cuvsWriterThreads; + final int intGraphDegree; + final int graphDegree; + final MergeStrategy mergeStrategy; + final CuVSVectorsWriter.IndexType indexType; // the index type to build, when writing + + /** + * Creates a CuVSVectorsFormat, with default values. + * + * @throws LibraryException if the native library fails to load + */ + public CuVSVectorsFormat() { + this( + DEFAULT_WRITER_THREADS, + DEFAULT_INTERMEDIATE_GRAPH_DEGREE, + DEFAULT_GRAPH_DEGREE, + DEFAULT_MERGE_STRATEGY, + DEFAULT_INDEX_TYPE); + } + + /** + * Creates a CuVSVectorsFormat, with the given threads, graph degree, etc. + * + * @throws LibraryException if the native library fails to load + */ + public CuVSVectorsFormat( + int cuvsWriterThreads, + int intGraphDegree, + int graphDegree, + MergeStrategy mergeStrategy, + IndexType indexType) { + super("CuVSVectorsFormat"); + this.mergeStrategy = mergeStrategy; + this.cuvsWriterThreads = cuvsWriterThreads; + this.intGraphDegree = intGraphDegree; + this.graphDegree = graphDegree; + this.indexType = indexType; + } + + private static CuVSResources cuVSResourcesOrNull() { + try { + resources = CuVSResources.create(); + return resources; + } catch (UnsupportedOperationException uoe) { + LOG.warning("cuvs is not supported on this platform or java version: " + uoe.getMessage()); + } catch (Throwable t) { + if (t instanceof ExceptionInInitializerError ex) { + t = ex.getCause(); + } + LOG.warning("Exception occurred during creation of cuvs resources. 
" + t); + } + return null; + } + + /** Tells whether the platform supports cuvs. */ + public static boolean supported() { + return resources != null; + } + + private static void checkSupported() { + if (!supported()) { + throw new UnsupportedOperationException(); + } + } + + @Override + public CuVSVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { + checkSupported(); + var flatWriter = flatVectorsFormat.fieldsWriter(state); + return new CuVSVectorsWriter( + state, + cuvsWriterThreads, + intGraphDegree, + graphDegree, + mergeStrategy, + indexType, + resources, + flatWriter); + } + + @Override + public CuVSVectorsReader fieldsReader(SegmentReadState state) throws IOException { + checkSupported(); + var flatReader = flatVectorsFormat.fieldsReader(state); + return new CuVSVectorsReader(state, resources, flatReader); + } + + @Override + public int getMaxDimensions(String fieldName) { + return maxDimensions; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("CuVSVectorsFormat("); + sb.append("cuvsWriterThreads=").append(cuvsWriterThreads); + sb.append("intGraphDegree=").append(intGraphDegree); + sb.append("graphDegree=").append(graphDegree); + sb.append("mergeStrategy=").append(mergeStrategy); + sb.append("resources=").append(resources); + sb.append(")"); + return sb.toString(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java new file mode 100644 index 000000000000..cfb59121e36e --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -0,0 +1,487 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_CODEC_NAME; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_EXT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_EXT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_NAME; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_CURRENT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_START; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.BruteForceQuery; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CagraQuery; +import com.nvidia.cuvs.CagraSearchParams; +import com.nvidia.cuvs.CuVSResources; +import com.nvidia.cuvs.HnswIndex; +import com.nvidia.cuvs.HnswIndexParams; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.logging.Logger; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.hnsw.FlatVectorsReader; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.internal.hppc.IntObjectHashMap; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.hnsw.IntToIntFunction; + +/** KnnVectorsReader instance associated with CuVS format */ +public class CuVSVectorsReader extends KnnVectorsReader { + + @SuppressWarnings("unused") + private static final Logger log = Logger.getLogger(CuVSVectorsReader.class.getName()); + + private final CuVSResources resources; + private final FlatVectorsReader flatVectorsReader; // for reading the raw vectors + private final FieldInfos fieldInfos; + private final IntObjectHashMap fields; + private final IntObjectHashMap cuvsIndices; + private final IndexInput cuvsIndexInput; + + public CuVSVectorsReader( + SegmentReadState state, CuVSResources resources, FlatVectorsReader flatReader) + throws IOException { + this.resources = resources; + this.flatVectorsReader = flatReader; + this.fieldInfos = state.fieldInfos; + this.fields = new IntObjectHashMap<>(); + + String metaFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, CUVS_META_CODEC_EXT); + boolean success = false; + int versionMeta = -1; + try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName)) { + Throwable priorException = null; + try { + versionMeta = + CodecUtil.checkIndexHeader( + meta, + CUVS_META_CODEC_NAME, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + readFields(meta); + } 
catch (Throwable exception) { + priorException = exception; + } finally { + CodecUtil.checkFooter(meta, priorException); + } + var ioContext = state.context.withReadAdvice(ReadAdvice.SEQUENTIAL); + cuvsIndexInput = openCuVSInput(state, versionMeta, ioContext); + cuvsIndices = loadCuVSIndices(); + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + private static IndexInput openCuVSInput( + SegmentReadState state, int versionMeta, IOContext context) throws IOException { + String fileName = + IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, CUVS_INDEX_EXT); + IndexInput in = state.directory.openInput(fileName, context); + boolean success = false; + try { + int versionVectorData = + CodecUtil.checkIndexHeader( + in, + CUVS_INDEX_CODEC_NAME, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + checkVersion(versionMeta, versionVectorData, in); + CodecUtil.retrieveChecksum(in); + success = true; + return in; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(in); + } + } + } + + private void validateFieldEntry(FieldInfo info, FieldEntry fieldEntry) { + int dimension = info.getVectorDimension(); + if (dimension != fieldEntry.dims()) { + throw new IllegalStateException( + "Inconsistent vector dimension for field=\"" + + info.name + + "\"; " + + dimension + + " != " + + fieldEntry.dims()); + } + } + + private void readFields(ChecksumIndexInput meta) throws IOException { + for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { + FieldInfo info = fieldInfos.fieldInfo(fieldNumber); + if (info == null) { + throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); + } + FieldEntry fieldEntry = readField(meta, info); + validateFieldEntry(info, fieldEntry); + fields.put(info.number, fieldEntry); + } + } + + // List of vector similarity functions. This list is defined here, in order + // to avoid an undesirable dependency on the declaration and order of values + // in VectorSimilarityFunction. The list values and order must be identical + // to that of {@link o.a.l.c.l.Lucene94FieldInfosFormat#SIMILARITY_FUNCTIONS}. 
+ static final List SIMILARITY_FUNCTIONS = + List.of( + VectorSimilarityFunction.EUCLIDEAN, + VectorSimilarityFunction.DOT_PRODUCT, + VectorSimilarityFunction.COSINE, + VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT); + + static VectorSimilarityFunction readSimilarityFunction(DataInput input) throws IOException { + int i = input.readInt(); + if (i < 0 || i >= SIMILARITY_FUNCTIONS.size()) { + throw new IllegalArgumentException("invalid distance function: " + i); + } + return SIMILARITY_FUNCTIONS.get(i); + } + + static VectorEncoding readVectorEncoding(DataInput input) throws IOException { + int encodingId = input.readInt(); + if (encodingId < 0 || encodingId >= VectorEncoding.values().length) { + throw new CorruptIndexException("Invalid vector encoding id: " + encodingId, input); + } + return VectorEncoding.values()[encodingId]; + } + + private FieldEntry readField(IndexInput input, FieldInfo info) throws IOException { + VectorEncoding vectorEncoding = readVectorEncoding(input); + VectorSimilarityFunction similarityFunction = readSimilarityFunction(input); + if (similarityFunction != info.getVectorSimilarityFunction()) { + throw new IllegalStateException( + "Inconsistent vector similarity function for field=\"" + + info.name + + "\"; " + + similarityFunction + + " != " + + info.getVectorSimilarityFunction()); + } + return FieldEntry.readEntry(input, vectorEncoding, info.getVectorSimilarityFunction()); + } + + private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) { + final FieldInfo info = fieldInfos.fieldInfo(field); + final FieldEntry fieldEntry; + if (info == null || (fieldEntry = fields.get(info.number)) == null) { + throw new IllegalArgumentException("field=\"" + field + "\" not found"); + } + if (fieldEntry.vectorEncoding != expectedEncoding) { + throw new IllegalArgumentException( + "field=\"" + + field + + "\" is encoded as: " + + fieldEntry.vectorEncoding + + " expected: " + + expectedEncoding); + } + return fieldEntry; + } + + private IntObjectHashMap loadCuVSIndices() throws IOException { + var indices = new IntObjectHashMap(); + for (var e : fields) { + var fieldEntry = e.value; + int fieldNumber = e.key; + var cuvsIndex = loadCuVSIndex(fieldEntry); + indices.put(fieldNumber, cuvsIndex); + } + return indices; + } + + private CuVSIndex loadCuVSIndex(FieldEntry fieldEntry) throws IOException { + CagraIndex cagraIndex = null; + BruteForceIndex bruteForceIndex = null; + HnswIndex hnswIndex = null; + + try { + long len = fieldEntry.cagraIndexLength(); + if (len > 0) { + long off = fieldEntry.cagraIndexOffset(); + try (var slice = cuvsIndexInput.slice("cagra index", off, len); + var in = new IndexInputInputStream(slice)) { + cagraIndex = CagraIndex.newBuilder(resources).from(in).build(); + } + } + + len = fieldEntry.bruteForceIndexLength(); + if (len > 0) { + long off = fieldEntry.bruteForceIndexOffset(); + try (var slice = cuvsIndexInput.slice("bf index", off, len); + var in = new IndexInputInputStream(slice)) { + bruteForceIndex = BruteForceIndex.newBuilder(resources).from(in).build(); + } + } + + len = fieldEntry.hnswIndexLength(); + if (len > 0) { + long off = fieldEntry.hnswIndexOffset(); + try (var slice = cuvsIndexInput.slice("hnsw index", off, len); + var in = new IndexInputInputStream(slice)) { + var params = new HnswIndexParams.Builder().build(); + hnswIndex = HnswIndex.newBuilder(resources).withIndexParams(params).from(in).build(); + } + } + } catch (Throwable t) { + handleThrowable(t); + } + return new CuVSIndex(cagraIndex, bruteForceIndex, 
hnswIndex); + } + + @Override + public void close() throws IOException { + var closeableStream = + Stream.concat( + Stream.of(flatVectorsReader, cuvsIndexInput), + stream(cuvsIndices.values().iterator()).map(cursor -> cursor.value)); + IOUtils.close(closeableStream::iterator); + } + + static Stream stream(Iterator iterator) { + return StreamSupport.stream(((Iterable) () -> iterator).spliterator(), false); + } + + @Override + public void checkIntegrity() throws IOException { + // TODO: Pending implementation + } + + @Override + public FloatVectorValues getFloatVectorValues(String field) throws IOException { + return flatVectorsReader.getFloatVectorValues(field); + } + + @Override + public ByteVectorValues getByteVectorValues(String field) { + throw new UnsupportedOperationException("byte vectors not supported"); + } + + /** Native float to float function */ + public interface FloatToFloatFunction { + float apply(float v); + } + + static long[] bitsToLongArray(Bits bits) { + if (bits instanceof FixedBitSet fixedBitSet) { + return fixedBitSet.getBits(); + } else { + return FixedBitSet.copyOf(bits).getBits(); + } + } + + static FloatToFloatFunction getScoreNormalizationFunc(VectorSimilarityFunction sim) { + // TODO: check for different similarities + return score -> (1f / (1f + score)); + } + + // This is a hack - https://github.com/rapidsai/cuvs/issues/696 + static final int FILTER_OVER_SAMPLE = 10; + + @Override + public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) + throws IOException { + var fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32); + if (fieldEntry.count() == 0 || knnCollector.k() == 0) { + return; + } + + var fieldNumber = fieldInfos.fieldInfo(field).number; + // log.info("fieldNumber=" + fieldNumber + ", fieldEntry.count()=" + fieldEntry.count()); + + CuVSIndex cuvsIndex = cuvsIndices.get(fieldNumber); + if (cuvsIndex == null) { + throw new IllegalStateException("not index found for field:" + field); + } + + int collectorTopK = knnCollector.k(); + if (acceptDocs != null) { + collectorTopK = knnCollector.k() * FILTER_OVER_SAMPLE; + } + final int topK = Math.min(collectorTopK, fieldEntry.count()); + assert topK > 0 : "Expected topK > 0, got:" + topK; + + Map result; + if (knnCollector.k() <= 1024 && cuvsIndex.getCagraIndex() != null) { + // log.info("searching cagra index"); + CagraSearchParams searchParams = + new CagraSearchParams.Builder(resources) + .withItopkSize(topK) // TODO: params + .withSearchWidth(1) + .build(); + + var query = + new CagraQuery.Builder() + .withTopK(topK) + .withSearchParams(searchParams) + // we don't use ord to doc mapping, https://github.com/rapidsai/cuvs/issues/699 + .withMapping(null) + .withQueryVectors(new float[][] {target}) + .build(); + + CagraIndex cagraIndex = cuvsIndex.getCagraIndex(); + List> searchResult = null; + try { + searchResult = cagraIndex.search(query).getResults(); + } catch (Throwable t) { + handleThrowable(t); + } + // List expected to have only one entry because of single query "target". 
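getScoreNormalizationFunc above maps a CuVS distance d (smaller is closer) to the Lucene score 1 / (1 + d) (larger is better), and when acceptDocs is non-null the requested topK is multiplied by FILTER_OVER_SAMPLE before filtering. A tiny worked sketch of the distance-to-score mapping (distances are hypothetical):

    // Sketch only: the distance-to-score mapping used when collecting CuVS results.
    public class CuVSScoreMappingSketch {
      public static void main(String[] args) {
        float[] distances = {0.0f, 1.0f, 3.0f};              // hypothetical CuVS distances
        for (float d : distances) {
          System.out.println(d + " -> " + (1f / (1f + d)));  // 0.0 -> 1.0, 1.0 -> 0.5, 3.0 -> 0.25
        }
      }
    }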
+ assert searchResult.size() == 1; + result = searchResult.getFirst(); + } else { + BruteForceIndex bruteforceIndex = cuvsIndex.getBruteforceIndex(); + assert bruteforceIndex != null; + // log.info("searching brute index, with actual topK=" + topK); + var queryBuilder = + new BruteForceQuery.Builder().withQueryVectors(new float[][] {target}).withTopK(topK); + BruteForceQuery query = queryBuilder.build(); + + List> searchResult = null; + try { + searchResult = bruteforceIndex.search(query).getResults(); + } catch (Throwable t) { + handleThrowable(t); + } + assert searchResult.size() == 1; + result = searchResult.getFirst(); + } + assert result != null; + + final var rawValues = flatVectorsReader.getFloatVectorValues(field); + final Bits acceptedOrds = rawValues.getAcceptOrds(acceptDocs); + final var ordToDocFunction = (IntToIntFunction) rawValues::ordToDoc; + final var scoreCorrectionFunction = getScoreNormalizationFunc(fieldEntry.similarityFunction); + + for (var entry : result.entrySet()) { + int ord = entry.getKey(); + float score = entry.getValue(); + if (acceptedOrds == null || acceptedOrds.get(ord)) { + if (knnCollector.earlyTerminated()) { + break; + } + assert ord >= 0 : "unexpected ord: " + ord; + int doc = ordToDocFunction.apply(ord); + float correctedScore = scoreCorrectionFunction.apply(score); + knnCollector.incVisitedCount(1); + knnCollector.collect(doc, correctedScore); + } + } + } + + @Override + public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) + throws IOException { + throw new UnsupportedOperationException("byte vectors not supported"); + } + + record FieldEntry( + VectorEncoding vectorEncoding, + VectorSimilarityFunction similarityFunction, + int dims, + int count, + long cagraIndexOffset, + long cagraIndexLength, + long bruteForceIndexOffset, + long bruteForceIndexLength, + long hnswIndexOffset, + long hnswIndexLength) { + + static FieldEntry readEntry( + IndexInput input, + VectorEncoding vectorEncoding, + VectorSimilarityFunction similarityFunction) + throws IOException { + var dims = input.readInt(); + var count = input.readInt(); + var cagraIndexOffset = input.readVLong(); + var cagraIndexLength = input.readVLong(); + var bruteForceIndexOffset = input.readVLong(); + var bruteForceIndexLength = input.readVLong(); + var hnswIndexOffset = input.readVLong(); + var hnswIndexLength = input.readVLong(); + return new FieldEntry( + vectorEncoding, + similarityFunction, + dims, + count, + cagraIndexOffset, + cagraIndexLength, + bruteForceIndexOffset, + bruteForceIndexLength, + hnswIndexOffset, + hnswIndexLength); + } + } + + static void checkVersion(int versionMeta, int versionVectorData, IndexInput in) + throws CorruptIndexException { + if (versionMeta != versionVectorData) { + throw new CorruptIndexException( + "Format versions mismatch: meta=" + + versionMeta + + ", " + + CUVS_META_CODEC_NAME + + "=" + + versionVectorData, + in); + } + } + + static void handleThrowable(Throwable t) throws IOException { + switch (t) { + case IOException ioe -> throw ioe; + case Error error -> throw error; + case RuntimeException re -> throw re; + case null, default -> throw new RuntimeException("UNEXPECTED: exception type", t); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java new file mode 100644 index 000000000000..61f77ee26e7c --- /dev/null +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -0,0 +1,505 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.SIMILARITY_FUNCTIONS; +import static org.apache.lucene.index.VectorEncoding.FLOAT32; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_CODEC_NAME; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_EXT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_EXT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_NAME; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_CURRENT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsReader.handleThrowable; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; +import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.BruteForceIndexParams; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CagraIndexParams; +import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo; +import com.nvidia.cuvs.CuVSResources; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.logging.Logger; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatVectorsWriter; +import org.apache.lucene.index.DocsWithFieldSet; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Sorter; +import org.apache.lucene.index.Sorter.DocMap; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.InfoStream; + +/** KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU */ +public class CuVSVectorsWriter extends KnnVectorsWriter { + + private static final long SHALLOW_RAM_BYTES_USED = shallowSizeOfInstance(CuVSVectorsWriter.class); + + 
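+  // Illustrative summary (descriptive comment only) of the per-field metadata record that
+  // writeMeta(...) below emits and that CuVSVectorsReader.FieldEntry.readEntry(...) reads back:
+  //
+  //   int   fieldNumber
+  //   int   vectorEncoding ordinal           (FLOAT32 only)
+  //   int   similarityFunction ordinal       (index into SIMILARITY_FUNCTIONS)
+  //   int   dimension
+  //   int   count                            (number of vectors; 0 for an empty field)
+  //   vlong cagraIndexOffset,      vlong cagraIndexLength
+  //   vlong bruteForceIndexOffset, vlong bruteForceIndexLength
+  //   vlong hnswIndexOffset,       vlong hnswIndexLength
+  //
+  // A length of 0 means the corresponding cuVS index was not built for that field, and
+  // finish() terminates the field list with a -1 marker.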
@SuppressWarnings("unused") + private static final Logger log = Logger.getLogger(CuVSVectorsWriter.class.getName()); + + /** The name of the CUVS component for the info-stream * */ + public static final String CUVS_COMPONENT = "CUVS"; + + // The minimum number of vectors in the dataset required before + // we attempt to build a Cagra index + static final int MIN_CAGRA_INDEX_SIZE = 2; + + private final int cuvsWriterThreads; + private final int intGraphDegree; + private final int graphDegree; + + private final CuVSResources resources; + private final IndexType indexType; + + @SuppressWarnings("unused") + private final MergeStrategy mergeStrategy; + + private final FlatVectorsWriter flatVectorsWriter; // for writing the raw vectors + private final List fields = new ArrayList<>(); + private final IndexOutput meta, cuvsIndex; + private final InfoStream infoStream; + private boolean finished; + + /** Merge strategy used for CuVS */ + public enum MergeStrategy { + TRIVIAL_MERGE, + NON_TRIVIAL_MERGE + } + + /** The CuVS index Type. */ + public enum IndexType { + /** Builds a Cagra index. */ + CAGRA(true, false, false), + /** Builds a Brute Force index. */ + BRUTE_FORCE(false, true, false), + /** Builds an HSNW index - suitable for searching on CPU. */ + HNSW(false, false, true), + /** Builds a Cagra and a Brute Force index. */ + CAGRA_AND_BRUTE_FORCE(true, true, false); + private final boolean cagra, bruteForce, hnsw; + + IndexType(boolean cagra, boolean bruteForce, boolean hnsw) { + this.cagra = cagra; + this.bruteForce = bruteForce; + this.hnsw = hnsw; + } + + public boolean cagra() { + return cagra; + } + + public boolean bruteForce() { + return bruteForce; + } + + public boolean hnsw() { + return hnsw; + } + } + + public CuVSVectorsWriter( + SegmentWriteState state, + int cuvsWriterThreads, + int intGraphDegree, + int graphDegree, + MergeStrategy mergeStrategy, + IndexType indexType, + CuVSResources resources, + FlatVectorsWriter flatVectorsWriter) + throws IOException { + super(); + this.mergeStrategy = mergeStrategy; + this.indexType = indexType; + this.cuvsWriterThreads = cuvsWriterThreads; + this.intGraphDegree = intGraphDegree; + this.graphDegree = graphDegree; + this.resources = resources; + this.flatVectorsWriter = flatVectorsWriter; + this.infoStream = state.infoStream; + + String metaFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, CUVS_META_CODEC_EXT); + String cagraFileName = + IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, CUVS_INDEX_EXT); + + boolean success = false; + try { + meta = state.directory.createOutput(metaFileName, state.context); + cuvsIndex = state.directory.createOutput(cagraFileName, state.context); + CodecUtil.writeIndexHeader( + meta, + CUVS_META_CODEC_NAME, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + CodecUtil.writeIndexHeader( + cuvsIndex, + CUVS_INDEX_CODEC_NAME, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + @Override + public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { + var encoding = fieldInfo.getVectorEncoding(); + if (encoding != FLOAT32) { + throw new IllegalArgumentException("expected float32, got:" + encoding); + } + var writer = Objects.requireNonNull(flatVectorsWriter.addField(fieldInfo)); + @SuppressWarnings("unchecked") + var flatWriter = (FlatFieldVectorsWriter) writer; + var 
cuvsFieldWriter = new CuVSFieldWriter(fieldInfo, flatWriter); + fields.add(cuvsFieldWriter); + return writer; + } + + static String indexMsg(int size, int... args) { + StringBuilder sb = new StringBuilder("cagra index params"); + sb.append(": size=").append(size); + sb.append(", intGraphDegree=").append(args[0]); + sb.append(", actualIntGraphDegree=").append(args[1]); + sb.append(", graphDegree=").append(args[2]); + sb.append(", actualGraphDegree=").append(args[3]); + return sb.toString(); + } + + private CagraIndexParams cagraIndexParams(int size) { + if (size < 2) { + // https://github.com/rapidsai/cuvs/issues/666 + throw new IllegalArgumentException("cagra index must be greater than 2"); + } + var minIntGraphDegree = Math.min(intGraphDegree, size - 1); + var minGraphDegree = Math.min(graphDegree, minIntGraphDegree); + // log.info(indexMsg(size, intGraphDegree, minIntGraphDegree, graphDegree, minGraphDegree)); + + return new CagraIndexParams.Builder() + .withNumWriterThreads(cuvsWriterThreads) + .withIntermediateGraphDegree(minIntGraphDegree) + .withGraphDegree(minGraphDegree) + .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) + .build(); + } + + static long nanosToMillis(long nanos) { + return Duration.ofNanos(nanos).toMillis(); + } + + private void info(String msg) { + if (infoStream.isEnabled(CUVS_COMPONENT)) { + infoStream.message(CUVS_COMPONENT, msg); + } + } + + private void writeCagraIndex(OutputStream os, float[][] vectors) throws Throwable { + if (vectors.length < 2) { + throw new IllegalArgumentException(vectors.length + " vectors, less than min [2] required"); + } + CagraIndexParams params = cagraIndexParams(vectors.length); + long startTime = System.nanoTime(); + var index = + CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(params).build(); + long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + info("Cagra index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); + Path tmpFile = Files.createTempFile(resources.tempDirectory(), "tmpindex", "cag"); + index.serialize(os, tmpFile); + index.destroyIndex(); + } + + private void writeBruteForceIndex(OutputStream os, float[][] vectors) throws Throwable { + BruteForceIndexParams params = + new BruteForceIndexParams.Builder() + .withNumWriterThreads(32) // TODO: Make this configurable later. 
+ .build(); + long startTime = System.nanoTime(); + var index = + BruteForceIndex.newBuilder(resources).withIndexParams(params).withDataset(vectors).build(); + long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + info("bf index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); + index.serialize(os); + index.destroyIndex(); + } + + private void writeHNSWIndex(OutputStream os, float[][] vectors) throws Throwable { + if (vectors.length < 2) { + throw new IllegalArgumentException(vectors.length + " vectors, less than min [2] required"); + } + CagraIndexParams indexParams = cagraIndexParams(vectors.length); + long startTime = System.nanoTime(); + var index = + CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build(); + long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + info("HNSW index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); + Path tmpFile = Files.createTempFile("tmpindex", "hnsw"); + index.serializeToHNSW(os, tmpFile); + index.destroyIndex(); + } + + @Override + public void flush(int maxDoc, DocMap sortMap) throws IOException { + flatVectorsWriter.flush(maxDoc, sortMap); + for (var field : fields) { + if (sortMap == null) { + writeField(field); + } else { + writeSortingField(field, sortMap); + } + } + } + + private void writeField(CuVSFieldWriter fieldData) throws IOException { + // TODO: Argh! https://github.com/rapidsai/cuvs/issues/698 + float[][] vectors = fieldData.getVectors().toArray(float[][]::new); + writeFieldInternal(fieldData.fieldInfo(), vectors); + } + + private void writeSortingField(CuVSFieldWriter fieldData, Sorter.DocMap sortMap) + throws IOException { + DocsWithFieldSet oldDocsWithFieldSet = fieldData.getDocsWithFieldSet(); + final int[] new2OldOrd = new int[oldDocsWithFieldSet.cardinality()]; // new ord to old ord + + mapOldOrdToNewOrd(oldDocsWithFieldSet, sortMap, null, new2OldOrd, null); + + // TODO: Argh! https://github.com/rapidsai/cuvs/issues/698 + // Also will be replaced with the cuVS merge api + float[][] oldVectors = fieldData.getVectors().toArray(float[][]::new); + float[][] newVectors = new float[oldVectors.length][]; + for (int i = 0; i < oldVectors.length; i++) { + newVectors[i] = oldVectors[new2OldOrd[i]]; + } + writeFieldInternal(fieldData.fieldInfo(), newVectors); + } + + private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws IOException { + if (vectors.length == 0) { + writeEmpty(fieldInfo); + return; + } + long cagraIndexOffset, cagraIndexLength = 0L; + long bruteForceIndexOffset, bruteForceIndexLength = 0L; + long hnswIndexOffset, hnswIndexLength = 0L; + + // workaround for the minimum number of vectors for Cagra + IndexType indexType = + this.indexType.cagra() && vectors.length < MIN_CAGRA_INDEX_SIZE + ? 
IndexType.BRUTE_FORCE + : this.indexType; + + try { + cagraIndexOffset = cuvsIndex.getFilePointer(); + if (indexType.cagra()) { + try { + var cagraIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + writeCagraIndex(cagraIndexOutputStream, vectors); + } catch (Throwable t) { + handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); + // workaround for cuVS issue + indexType = IndexType.BRUTE_FORCE; + } + cagraIndexLength = cuvsIndex.getFilePointer() - cagraIndexOffset; + } + + bruteForceIndexOffset = cuvsIndex.getFilePointer(); + if (indexType.bruteForce()) { + var bruteForceIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + writeBruteForceIndex(bruteForceIndexOutputStream, vectors); + bruteForceIndexLength = cuvsIndex.getFilePointer() - bruteForceIndexOffset; + } + + hnswIndexOffset = cuvsIndex.getFilePointer(); + if (indexType.hnsw()) { + var hnswIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + if (vectors.length > MIN_CAGRA_INDEX_SIZE) { + try { + writeHNSWIndex(hnswIndexOutputStream, vectors); + } catch (Throwable t) { + handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); + } + } + hnswIndexLength = cuvsIndex.getFilePointer() - hnswIndexOffset; + } + + // StringBuilder sb = new StringBuilder("writeField "); + // sb.append(": fieldInfo.name=").append(fieldInfo.name); + // sb.append(", fieldInfo.number=").append(fieldInfo.number); + // sb.append(", size=").append(vectors.length); + // sb.append(", cagraIndexLength=").append(cagraIndexLength); + // sb.append(", bruteForceIndexLength=").append(bruteForceIndexLength); + // sb.append(", hnswIndexLength=").append(hnswIndexLength); + // log.info(sb.toString()); + + writeMeta( + fieldInfo, + vectors.length, + cagraIndexOffset, + cagraIndexLength, + bruteForceIndexOffset, + bruteForceIndexLength, + hnswIndexOffset, + hnswIndexLength); + } catch (Throwable t) { + handleThrowable(t); + } + } + + private void writeEmpty(FieldInfo fieldInfo) throws IOException { + writeMeta(fieldInfo, 0, 0L, 0L, 0L, 0L, 0L, 0L); + } + + private void writeMeta( + FieldInfo field, + int count, + long cagraIndexOffset, + long cagraIndexLength, + long bruteForceIndexOffset, + long bruteForceIndexLength, + long hnswIndexOffset, + long hnswIndexLength) + throws IOException { + meta.writeInt(field.number); + meta.writeInt(field.getVectorEncoding().ordinal()); + meta.writeInt(distFuncToOrd(field.getVectorSimilarityFunction())); + meta.writeInt(field.getVectorDimension()); + meta.writeInt(count); + meta.writeVLong(cagraIndexOffset); + meta.writeVLong(cagraIndexLength); + meta.writeVLong(bruteForceIndexOffset); + meta.writeVLong(bruteForceIndexLength); + meta.writeVLong(hnswIndexOffset); + meta.writeVLong(hnswIndexLength); + } + + static int distFuncToOrd(VectorSimilarityFunction func) { + for (int i = 0; i < SIMILARITY_FUNCTIONS.size(); i++) { + if (SIMILARITY_FUNCTIONS.get(i).equals(func)) { + return (byte) i; + } + } + throw new IllegalArgumentException("invalid distance function: " + func); + } + + // We currently ignore this, until cuVS supports tiered indices + private static final String CANNOT_GENERATE_CAGRA = + """ + Could not generate an intermediate CAGRA graph because the initial \ + kNN graph contains too many invalid or duplicated neighbor nodes. 
\ + This error can occur, for example, if too many overflows occur \ + during the norm computation between the dataset vectors\ + """; + + static void handleThrowableWithIgnore(Throwable t, String msg) throws IOException { + if (t.getMessage().contains(msg)) { + return; + } + handleThrowable(t); + } + + /** Copies the vector values into dst. Returns the actual number of vectors copied. */ + private static int getVectorData(FloatVectorValues floatVectorValues, float[][] dst) + throws IOException { + DocsWithFieldSet docsWithField = new DocsWithFieldSet(); + int count = 0; + KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { + assert iter.index() == count; + dst[iter.index()] = floatVectorValues.vectorValue(iter.index()); + docsWithField.add(docV); + count++; + } + return docsWithField.cardinality(); + } + + @Override + public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { + flatVectorsWriter.mergeOneField(fieldInfo, mergeState); + try { + final FloatVectorValues mergedVectorValues = + switch (fieldInfo.getVectorEncoding()) { + case BYTE -> throw new AssertionError("bytes not supported"); + case FLOAT32 -> + KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); + }; + + float[][] vectors = new float[mergedVectorValues.size()][mergedVectorValues.dimension()]; + int ret = getVectorData(mergedVectorValues, vectors); + if (ret < vectors.length) { + vectors = ArrayUtil.copyOfSubArray(vectors, 0, ret); + } + writeFieldInternal(fieldInfo, vectors); + } catch (Throwable t) { + handleThrowable(t); + } + } + + @Override + public void finish() throws IOException { + if (finished) { + throw new IllegalStateException("already finished"); + } + finished = true; + flatVectorsWriter.finish(); + + if (meta != null) { + // write end of fields marker + meta.writeInt(-1); + CodecUtil.writeFooter(meta); + } + if (cuvsIndex != null) { + CodecUtil.writeFooter(cuvsIndex); + } + } + + @Override + public void close() throws IOException { + IOUtils.close(meta, cuvsIndex, flatVectorsWriter); + } + + @Override + public long ramBytesUsed() { + long total = SHALLOW_RAM_BYTES_USED; + for (var field : fields) { + total += field.ramBytesUsed(); + } + return total; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java new file mode 100644 index 000000000000..842fdde65dd2 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CuVSResources; +import com.nvidia.cuvs.HnswIndex; +import com.nvidia.cuvs.spi.CuVSProvider; +import java.nio.file.Path; + +/*package-private*/ class FilterCuVSProvider implements CuVSProvider { + + private final CuVSProvider delegate; + + FilterCuVSProvider(CuVSProvider delegate) { + this.delegate = delegate; + } + + @Override + public Path nativeLibraryPath() { + return CuVSProvider.TMPDIR; + } + + @Override + public CuVSResources newCuVSResources(Path tempPath) throws Throwable { + return delegate.newCuVSResources(tempPath); + } + + @Override + public BruteForceIndex.Builder newBruteForceIndexBuilder(CuVSResources cuVSResources) + throws UnsupportedOperationException { + return delegate.newBruteForceIndexBuilder(cuVSResources); + } + + @Override + public CagraIndex.Builder newCagraIndexBuilder(CuVSResources cuVSResources) + throws UnsupportedOperationException { + return delegate.newCagraIndexBuilder(cuVSResources); + } + + @Override + public HnswIndex.Builder newHnswIndexBuilder(CuVSResources cuVSResources) + throws UnsupportedOperationException { + return delegate.newHnswIndexBuilder(cuVSResources); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java new file mode 100644 index 000000000000..eeb7b6895aa3 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.spi.CuVSProvider; +import com.nvidia.cuvs.spi.CuVSServiceProvider; + +/** A provider that creates instances of FilterCuVSProvider. */ +public class FilterCuVSServiceProvider extends CuVSServiceProvider { + @Override + public CuVSProvider get(CuVSProvider builtinProvider) { + return new FilterCuVSProvider(builtinProvider); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java new file mode 100644 index 000000000000..4eb8ed558f70 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.io.InputStream; +import org.apache.lucene.store.IndexInput; + +/** InputStream for reading from an IndexInput. */ +final class IndexInputInputStream extends InputStream { + + final IndexInput in; + long pos = 0; + final long limit; + + IndexInputInputStream(IndexInput in) { + this.in = in; + this.limit = in.length(); + } + + @Override + public int read() throws IOException { + if (pos >= limit) { + return -1; + } + pos++; + return in.readByte(); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + if (len <= 0) { + return 0; + } + if (pos >= limit) { + return -1; + } + long avail = limit - pos; + if (len > avail) { + len = (int) avail; + } + in.readBytes(b, off, len); + pos += len; + return len; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java new file mode 100644 index 000000000000..ffb2b922e4b5 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.io.OutputStream; +import org.apache.lucene.store.IndexOutput; + +/** OutputStream for writing into an IndexOutput */ +final class IndexOutputOutputStream extends OutputStream { + + static final int DEFAULT_BUFFER_SIZE = 8192; + + final IndexOutput out; + final int bufferSize; + final byte[] buffer; + int idx; + + IndexOutputOutputStream(IndexOutput out) { + this(out, DEFAULT_BUFFER_SIZE); + } + + IndexOutputOutputStream(IndexOutput out, int bufferSize) { + this.out = out; + this.bufferSize = bufferSize; + this.buffer = new byte[bufferSize]; + } + + @Override + public void write(int b) throws IOException { + buffer[idx] = (byte) b; + idx++; + if (idx == bufferSize) { + flush(); + } + } + + @Override + public void write(byte[] b, int offset, int length) throws IOException { + if (idx != 0) { + flush(); + } + out.writeBytes(b, offset, length); + } + + @Override + public void flush() throws IOException { + out.writeBytes(buffer, 0, idx); + idx = 0; + } + + @Override + public void close() throws IOException { + this.flush(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java new file mode 100644 index 000000000000..caf9566064e9 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TotalHits; + +/** KnnCollector for CuVS */ +/*package-private*/ class PerLeafCuVSKnnCollector implements KnnCollector { + + public List scoreDocs; + public int topK = 0; + public int iTopK = topK; // TODO getter, no setter + public int searchWidth = 1; // TODO getter, no setter + public int results = 0; + + public PerLeafCuVSKnnCollector(int topK, int iTopK, int searchWidth) { + super(); + this.topK = topK; + this.iTopK = iTopK; + this.searchWidth = searchWidth; + scoreDocs = new ArrayList(); + } + + @Override + public boolean earlyTerminated() { + // TODO: may need implementation + return false; + } + + @Override + public void incVisitedCount(int count) { + // TODO: may need implementation + } + + @Override + public long visitedCount() { + // TODO: may need implementation + return 0; + } + + @Override + public long visitLimit() { + // TODO: may need implementation + return 0; + } + + @Override + public int k() { + return topK; + } + + @Override + @SuppressWarnings("cast") + public boolean collect(int docId, float similarity) { + scoreDocs.add(new ScoreDoc(docId, similarity)); + return true; + } + + @Override + public float minCompetitiveSimilarity() { + // TODO: may need implementation + return 0; + } + + @Override + public TopDocs topDocs() { + return new TopDocs( + new TotalHits(scoreDocs.size(), TotalHits.Relation.EQUAL_TO), + scoreDocs.toArray(new ScoreDoc[scoreDocs.size()])); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java new file mode 100644 index 000000000000..86c56b909dd1 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** CuVS based fast vector search */ +package org.apache.lucene.sandbox.vectorsearch; diff --git a/lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider b/lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider new file mode 100644 index 000000000000..5e7ceba19343 --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.sandbox.vectorsearch.FilterCuVSServiceProvider \ No newline at end of file diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat new file mode 100644 index 000000000000..666ee726f986 --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java new file mode 100644 index 000000000000..a20a49be6f53 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.TreeMap; +import java.util.logging.Logger; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.analysis.MockTokenizer; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.English; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks; +import org.apache.lucene.tests.util.TestUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +@SuppressSysoutChecks(bugUrl = "prints info from within cuvs") +public class TestCuVS extends LuceneTestCase { + + protected static Logger log = Logger.getLogger(TestCuVS.class.getName()); + + static final Codec codec = TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); + static IndexSearcher searcher; + static IndexReader reader; + static Directory directory; + + static int DATASET_SIZE_LIMIT = 1000; + static int DIMENSIONS_LIMIT = 2048; + static int NUM_QUERIES_LIMIT = 10; + static int TOP_K_LIMIT = 64; // TODO This fails beyond 64 + + public static float[][] dataset; + + @BeforeClass + public static void beforeClass() throws Exception { + assumeTrue("cuvs not supported", CuVSVectorsFormat.supported()); + directory = newDirectory(); + + RandomIndexWriter writer = + new RandomIndexWriter( + random(), + directory, + newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)) + .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)) + .setCodec(codec) + .setMergePolicy(newTieredMergePolicy())); + + log.info("Merge Policy: " + writer.w.getConfig().getMergePolicy()); + + Random random = random(); + int datasetSize = random.nextInt(DATASET_SIZE_LIMIT) + 1; + int dimensions = random.nextInt(DIMENSIONS_LIMIT) + 1; + dataset = generateDataset(random, datasetSize, dimensions); + for (int i = 0; i < datasetSize; i++) { + Document doc = new Document(); + doc.add(new StringField("id", String.valueOf(i), Field.Store.YES)); + doc.add(newTextField("field", English.intToEnglish(i), Field.Store.YES)); + boolean skipVector = + random.nextInt(10) < 0; // disable testing with holes for now, there's some bug. 
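+      // Note: random.nextInt(10) < 0 is always false, so skipVector never skips and every
+      // document currently gets vectors. To restore the intended ~10% of documents without
+      // vectors once the underlying bug is fixed, a threshold such as random.nextInt(10) < 1
+      // (an assumed value, not part of this change) would re-enable the holes.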
+ if (!skipVector + || datasetSize < 100) { // about 10th of the documents shouldn't have a single vector + doc.add(new KnnFloatVectorField("vector", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new KnnFloatVectorField("vector2", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); + } + + writer.addDocument(doc); + } + + reader = writer.getReader(); + searcher = newSearcher(reader); + writer.close(); + } + + @AfterClass + public static void afterClass() throws Exception { + if (reader != null) reader.close(); + if (directory != null) directory.close(); + searcher = null; + reader = null; + directory = null; + log.info("Test finished"); + } + + @Test + public void testVectorSearch() throws IOException { + Random random = random(); + int numQueries = random.nextInt(NUM_QUERIES_LIMIT) + 1; + int topK = Math.min(random.nextInt(TOP_K_LIMIT) + 1, dataset.length); + + if (dataset.length < topK) topK = dataset.length; + + float[][] queries = generateQueries(random, dataset[0].length, numQueries); + List> expected = generateExpectedResults(topK, dataset, queries); + + log.info("Dataset size: " + dataset.length + "x" + dataset[0].length); + log.info("Query size: " + numQueries + "x" + queries[0].length); + log.info("TopK: " + topK); + + // Query query = new CuVSKnnFloatVectorQuery("vector", queries[0], topK, topK, 1); + Query query = new KnnFloatVectorQuery("vector", queries[0], topK); + int correct[] = new int[topK]; + for (int i = 0; i < topK; i++) correct[i] = expected.get(0).get(i); + + ScoreDoc[] hits = searcher.search(query, topK).scoreDocs; + log.info("RESULTS: " + Arrays.toString(hits)); + log.info("EXPECTD: " + expected.get(0)); + + for (ScoreDoc hit : hits) { + log.info("\t" + reader.storedFields().document(hit.doc).get("id") + ": " + hit.score); + } + + for (ScoreDoc hit : hits) { + int doc = Integer.parseInt(reader.storedFields().document(hit.doc).get("id")); + assertTrue("Result returned was not in topk*2: " + doc, expected.get(0).contains(doc)); + } + } + + private static float[][] generateQueries(Random random, int dimensions, int numQueries) { + // Generate random query vectors + float[][] queries = new float[numQueries][dimensions]; + for (int i = 0; i < numQueries; i++) { + for (int j = 0; j < dimensions; j++) { + queries[i][j] = random.nextFloat() * 100; + } + } + return queries; + } + + private static float[][] generateDataset(Random random, int datasetSize, int dimensions) { + // Generate a random dataset + float[][] dataset = new float[datasetSize][dimensions]; + for (int i = 0; i < datasetSize; i++) { + for (int j = 0; j < dimensions; j++) { + dataset[i][j] = random.nextFloat() * 100; + } + } + return dataset; + } + + private static List> generateExpectedResults( + int topK, float[][] dataset, float[][] queries) { + List> neighborsResult = new ArrayList<>(); + int dimensions = dataset[0].length; + + for (float[] query : queries) { + Map distances = new TreeMap<>(); + for (int j = 0; j < dataset.length; j++) { + double distance = 0; + for (int k = 0; k < dimensions; k++) { + distance += (query[k] - dataset[j][k]) * (query[k] - dataset[j][k]); + } + distances.put(j, (distance)); + } + + Map sorted = new TreeMap(distances); + log.info("EXPECTED: " + sorted); + + // Sort by distance and select the topK nearest neighbors + List neighbors = + distances.entrySet().stream() + .sorted(Map.Entry.comparingByValue()) + .map(Map.Entry::getKey) + .toList(); + neighborsResult.add( + neighbors.subList( + 0, + Math.min( + topK * 3, + dataset.length))); // generate double the topK 
results in the expected array + } + + log.info("Expected results generated successfully."); + return neighborsResult; + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java new file mode 100644 index 000000000000..dbbdecf82ec9 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; + +import java.util.List; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.junit.BeforeClass; + +public class TestCuVSVectorsFormat extends BaseKnnVectorsFormatTestCase { + + @BeforeClass + public static void beforeClass() { + assumeTrue("cuvs is not supported", CuVSVectorsFormat.supported()); + } + + @Override + protected Codec getCodec() { + return TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); + // For convenience, to sanitize the test code, one can comment out + // the supported check and use another format, e.g. 
+ // return TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswVectorsFormat()); + } + + @Override + protected List supportedVectorEncodings() { + return List.of(VectorEncoding.FLOAT32); + } + + public void testMergeTwoSegsWithASingleDocPerSeg() throws Exception { + float[][] f = new float[][] {randomVector(384), randomVector(384)}; + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { + Document doc1 = new Document(); + doc1.add(new StringField("id", "0", Field.Store.NO)); + doc1.add(new KnnFloatVectorField("f", f[0], EUCLIDEAN)); + w.addDocument(doc1); + w.commit(); + Document doc2 = new Document(); + doc2.add(new StringField("id", "1", Field.Store.NO)); + doc2.add(new KnnFloatVectorField("f", f[1], EUCLIDEAN)); + w.addDocument(doc2); + w.flush(); + w.commit(); + + // sanity - verify one doc per leaf + try (DirectoryReader reader = DirectoryReader.open(w)) { + List subReaders = reader.leaves(); + assertEquals(2, subReaders.size()); + assertEquals(1, subReaders.get(0).reader().getFloatVectorValues("f").size()); + assertEquals(1, subReaders.get(1).reader().getFloatVectorValues("f").size()); + } + + // now merge to a single segment + w.forceMerge(1); + + // verify merged content + try (DirectoryReader reader = DirectoryReader.open(w)) { + LeafReader r = getOnlyLeafReader(reader); + FloatVectorValues values = r.getFloatVectorValues("f"); + assertNotNull(values); + assertEquals(2, values.size()); + assertArrayEquals(f[0], values.vectorValue(0), 0.0f); + assertArrayEquals(f[1], values.vectorValue(1), 0.0f); + } + } + } + + // Basic test for multiple vectors fields per document + public void testTwoVectorFieldsPerDoc() throws Exception { + float[][] f1 = new float[][] {randomVector(384), randomVector(384)}; + float[][] f2 = new float[][] {randomVector(384), randomVector(384)}; + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { + Document doc1 = new Document(); + doc1.add(new StringField("id", "0", Field.Store.NO)); + doc1.add(new KnnFloatVectorField("f1", f1[0], EUCLIDEAN)); + doc1.add(new KnnFloatVectorField("f2", f2[0], EUCLIDEAN)); + w.addDocument(doc1); + Document doc2 = new Document(); + doc2.add(new StringField("id", "1", Field.Store.NO)); + doc2.add(new KnnFloatVectorField("f1", f1[1], EUCLIDEAN)); + doc2.add(new KnnFloatVectorField("f2", f2[1], EUCLIDEAN)); + w.addDocument(doc2); + w.forceMerge(1); + + try (DirectoryReader reader = DirectoryReader.open(w)) { + LeafReader r = getOnlyLeafReader(reader); + FloatVectorValues values = r.getFloatVectorValues("f1"); + assertNotNull(values); + assertEquals(2, values.size()); + assertArrayEquals(f1[0], values.vectorValue(0), 0.0f); + assertArrayEquals(f1[1], values.vectorValue(1), 0.0f); + + values = r.getFloatVectorValues("f2"); + assertNotNull(values); + assertEquals(2, values.size()); + assertArrayEquals(f2[0], values.vectorValue(0), 0.0f); + assertArrayEquals(f2[1], values.vectorValue(1), 0.0f); + + // opportunistically check boundary condition - search with a 0 topK + var topDocs = r.searchNearestVectors("f1", randomVector(384), 0, null, 10); + assertEquals(0, topDocs.scoreDocs.length); + assertEquals(0, topDocs.totalHits.value()); + } + } + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java new file mode 100644 index 000000000000..e2e2b7600e9d --- /dev/null +++ 
b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import static org.apache.lucene.util.ArrayUtil.copyOfSubArray; + +import java.io.IOException; +import java.util.Random; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestIndexOutputOutputStream extends LuceneTestCase { + + public void testBasic() throws IOException { + try (var dir = newDirectory()) { + try (var indexOut = dir.createOutput("test", IOContext.DEFAULT)) { + var out = new IndexOutputOutputStream(indexOut); + out.write(0x56); + out.write(new byte[] {0x10, 0x11, 0x12, 0x13, 0x14}); + out.close(); + } + + try (var indexIn = dir.openInput("test", IOContext.DEFAULT)) { + var in = new IndexInputInputStream(indexIn); + // assertEquals(0x56, in.read()); + byte[] ba = new byte[6]; + assertEquals(6, in.read(ba)); + assertArrayEquals(new byte[] {0x56, 0x10, 0x11, 0x12, 0x13, 0x14}, ba); + } + } + } + + public void testGetFilePointer() throws IOException { + try (var dir = newDirectory()) { + try (var indexOut = dir.createOutput("test", IOContext.DEFAULT)) { + var out = new IndexOutputOutputStream(indexOut); + out.write(0x56); + out.write(new byte[] {0x10, 0x11, 0x12}); + assertEquals(4, indexOut.getFilePointer()); + out.close(); + } + } + } + + public void testWithRandom() throws IOException { + byte[] data = new byte[Math.min(atLeast(10_000), 20_000)]; + Random random = random(); + random.nextBytes(data); + + try (var dir = newDirectory()) { + try (var indexOut = dir.createOutput("test", IOContext.DEFAULT)) { + var out = new IndexOutputOutputStream(indexOut); + int i = 0; + while (i < data.length) { + if (random.nextBoolean()) { + out.write(data[i]); + i++; + } else { + int numBytes = random.nextInt(Math.min(data.length - i, 100)); + out.write(data, i, numBytes); + i += numBytes; + } + } + out.close(); + } + + try (var indexIn = dir.openInput("test", IOContext.DEFAULT)) { + var in = new IndexInputInputStream(indexIn); + int i = 0; + while (i < data.length) { + if (random.nextBoolean()) { + int b = in.read(); + assertEquals(data[i], b); + i++; + } else { + int numBytes = random.nextInt(Math.min(data.length - i, 100)); + byte[] ba = new byte[numBytes]; + in.read(ba, 0, numBytes); + assertArrayEquals(copyOfSubArray(data, i, i + numBytes), ba); + i += numBytes; + } + } + assertEquals(-1, in.read()); + assertEquals(-1, in.read(new byte[2])); + } + } + } +} diff --git a/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInShapeIntersectVisitor.java b/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInShapeIntersectVisitor.java index 
8883fef22409..9b990b08e475 100644 --- a/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInShapeIntersectVisitor.java +++ b/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInShapeIntersectVisitor.java @@ -27,6 +27,7 @@ import org.apache.lucene.spatial3d.geom.PlanetModel.DocValueEncoder; import org.apache.lucene.spatial3d.geom.XYZBounds; import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.NumericUtils; class PointInShapeIntersectVisitor implements IntersectVisitor { @@ -67,6 +68,11 @@ public void visit(DocIdSetIterator iterator) throws IOException { adder.add(iterator); } + @Override + public void visit(IntsRef ref) throws IOException { + adder.add(ref); + } + @Override public void visit(int docID, byte[] packedValue) { assert packedValue.length == 12; diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java index bbcfc7feb439..496d3b9232dc 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java @@ -17,8 +17,10 @@ package org.apache.lucene.search.suggest.document; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; +import java.util.List; import java.util.Map; import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.internal.hppc.IntHashSet; @@ -230,7 +232,7 @@ private static Automaton toContextAutomaton( if (matchAllContexts || contexts.size() == 0) { return Operations.concatenate(matchAllAutomaton, sep); } else { - Automaton contextsAutomaton = null; + List automataList = new ArrayList<>(); for (Map.Entry entry : contexts.entrySet()) { final ContextMetaData contextMetaData = entry.getValue(); final IntsRef ref = entry.getKey(); @@ -239,12 +241,9 @@ private static Automaton toContextAutomaton( contextAutomaton = Operations.concatenate(contextAutomaton, matchAllAutomaton); } contextAutomaton = Operations.concatenate(contextAutomaton, sep); - if (contextsAutomaton == null) { - contextsAutomaton = contextAutomaton; - } else { - contextsAutomaton = Operations.union(contextsAutomaton, contextAutomaton); - } + automataList.add(contextAutomaton); } + Automaton contextsAutomaton = Operations.union(automataList); return contextsAutomaton; } } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextQuery.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextQuery.java index 01eb834a6b0b..cf7d5f8e745b 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextQuery.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextQuery.java @@ -468,6 +468,38 @@ public void testMultiContextQuery() throws Exception { iw.close(); } + @Test + public void testBigNumberOfContextsQuery() throws Exception { + Analyzer analyzer = new MockAnalyzer(random()); + RandomIndexWriter iw = + new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field")); + for (int i = 1; i < 1001; i++) { + Document document = new Document(); + document.add( + new ContextSuggestField("suggest_field", "suggestion" + i, 1001 - i, "group" + i)); + iw.addDocument(document); + } + iw.commit(); + + DirectoryReader reader = iw.getReader(); + SuggestIndexSearcher 
suggestIndexSearcher = new SuggestIndexSearcher(reader); + ContextQuery query = + new ContextQuery(new PrefixCompletionQuery(analyzer, new Term("suggest_field", "sugg"))); + for (int i = 1; i < 1001; i++) { + query.addContext("group" + i, 1); + } + TopSuggestDocs suggest = suggestIndexSearcher.suggest(query, 5, false); + assertSuggestions( + suggest, + new Entry("suggestion1", "group1", 1000), + new Entry("suggestion2", "group2", 999), + new Entry("suggestion3", "group3", 998), + new Entry("suggestion4", "group4", 997), + new Entry("suggestion5", "group5", 996)); + reader.close(); + iw.close(); + } + @Test public void testAllContextQuery() throws Exception { Analyzer analyzer = new MockAnalyzer(random()); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingLiveDocsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingLiveDocsFormat.java index f45ea821a555..e2152e45aa58 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingLiveDocsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingLiveDocsFormat.java @@ -24,6 +24,7 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; /** Just like the default live docs format but with additional asserts. */ public class AssertingLiveDocsFormat extends LiveDocsFormat { @@ -88,6 +89,12 @@ public int length() { return in.length(); } + @Override + public void applyMask(FixedBitSet bitSet, int offset) { + assert offset >= 0; + in.applyMask(bitSet, offset); + } + @Override public String toString() { return "Asserting(" + in + ")"; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java index 297c1b777f53..c2aa7ff0e4de 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java @@ -275,6 +275,9 @@ public void testMergeStability() throws Exception { new IndexWriterConfig(new MockAnalyzer(random())) .setUseCompoundFile(false) .setMergePolicy(mp); + if (VERBOSE) { + cfg.setInfoStream(System.out); + } IndexWriter w = new IndexWriter(dir, cfg); final int numDocs = atLeast(500); for (int i = 0; i < numDocs; ++i) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index 752f21ea5d7a..ed1a76133968 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -124,6 +124,14 @@ protected void addRandomFields(Document doc) { } } + @Override + protected boolean mergeIsStable() { + // suppress this test from base class: merges for knn graphs are not stable due to connected + // components + // logic + return false; + } + private int getVectorsMaxDimensions(String fieldName) { return Codec.getDefault().knnVectorsFormat().getMaxDimensions(fieldName); } @@ -288,6 +296,7 @@ public KnnVectorsFormat knnVectorsFormat() { } public void testMergingWithDifferentByteKnnFields() throws Exception { + assumeTrue("bytes 
not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); try (var dir = newDirectory()) { IndexWriterConfig iwc = new IndexWriterConfig(); Codec codec = getCodec(); @@ -986,6 +995,7 @@ public void testFloatVectorScorerIteration() throws Exception { } public void testByteVectorScorerIteration() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); IndexWriterConfig iwc = newIndexWriterConfig(); if (random().nextBoolean()) { iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); @@ -1073,6 +1083,7 @@ public void testEmptyFloatVectorData() throws Exception { } public void testEmptyByteVectorData() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { var doc1 = new Document(); @@ -1104,11 +1115,16 @@ protected VectorSimilarityFunction randomSimilarity() { } /** - * This method is overrideable since old codec versions only support {@link - * VectorEncoding#FLOAT32}. + * The vector encodings supported by the format. Defaults to all VectorEncoding.values(). Override + * if the format only supports a subset of these encodings. */ + protected List supportedVectorEncodings() { + return Arrays.stream(VectorEncoding.values()).toList(); + } + protected VectorEncoding randomVectorEncoding() { - return VectorEncoding.values()[random().nextInt(VectorEncoding.values().length)]; + var encodings = supportedVectorEncodings().toArray(VectorEncoding[]::new); + return encodings[random().nextInt(encodings.length)]; } public void testIndexedValueNotAliased() throws Exception { @@ -1185,6 +1201,7 @@ public void testSortedIndex() throws Exception { } public void testSortedIndexBytes() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); IndexWriterConfig iwc = newIndexWriterConfig(); iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); String fieldName = "field"; @@ -1353,6 +1370,7 @@ public void testRandom() throws Exception { * back consistently. 
*/ public void testRandomBytes() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); IndexWriterConfig iwc = newIndexWriterConfig(); if (random().nextBoolean()) { iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); @@ -1867,6 +1885,7 @@ public void testVectorValuesReportCorrectDocs() throws Exception { } public void testMismatchedFields() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); Directory dir1 = newDirectory(); IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig()); Document doc = new Document(); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java index 64ca9cca35f0..15c9c324732c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java @@ -1388,10 +1388,6 @@ private void verifyEnum( PostingsEnum pe2 = termsEnum.postings(null, flags); FixedBitSet set1 = new FixedBitSet(1024); FixedBitSet set2 = new FixedBitSet(1024); - FixedBitSet acceptDocs = new FixedBitSet(maxDoc); - for (int i = 0; i < maxDoc; i += 2) { - acceptDocs.set(i); - } while (true) { pe1.nextDoc(); @@ -1400,11 +1396,9 @@ private void verifyEnum( int offset = TestUtil.nextInt(random, Math.max(0, pe1.docID() - set1.length()), pe1.docID()); int upTo = offset + random.nextInt(set1.length()); - pe1.intoBitSet(acceptDocs, upTo, set1, offset); + pe1.intoBitSet(upTo, set1, offset); for (int d = pe2.docID(); d < upTo; d = pe2.nextDoc()) { - if (acceptDocs.get(d)) { - set2.set(d - offset); - } + set2.set(d - offset); } assertEquals(set1, set2); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java index 7200d4b5f4dc..9717f738e82e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java @@ -24,7 +24,6 @@ import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.TwoPhaseIterator; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; /** Wraps a Scorer with additional checks */ @@ -196,11 +195,10 @@ public long cost() { } @Override - public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset) - throws IOException { + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { assert docID() != -1; assert offset <= docID(); - in.intoBitSet(acceptDocs, upTo, bitSet, offset); + in.intoBitSet(upTo, bitSet, offset); assert docID() >= upTo; } }; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java index 6defa5eb8c7a..f00b3811d0c4 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java @@ -771,6 +771,12 @@ public void testSliceOutOfBounds() throws Exception { slice.slice("slice3sub", 1, len / 2); }); + expectThrows( + IllegalArgumentException.class, + () -> { + 
i.slice("slice4", Long.MAX_VALUE - 1, 10); + }); + i.close(); } } @@ -1200,6 +1206,9 @@ public void testSliceOfSlice() throws Exception { slice1.seek(TestUtil.nextLong(random(), 0, slice1.length())); for (int j = 0; j < slice1.length(); j += 16) { IndexInput slice2 = slice1.slice("slice2", j, num - i - j); + if (random().nextBoolean()) { + slice2 = slice2.clone(); // clone shouldn't impact slice data + } assertEquals(0, slice2.getFilePointer()); assertEquals(num - i - j, slice2.length()); byte[] data = new byte[num]; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SerialIOCountingDirectory.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SerialIOCountingDirectory.java index 1b4234c3d79f..4d3c233257c8 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SerialIOCountingDirectory.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SerialIOCountingDirectory.java @@ -194,7 +194,7 @@ public IndexInput slice(String sliceDescription, long offset, long length) throw public IndexInput slice( String sliceDescription, long offset, long length, ReadAdvice readAdvice) throws IOException { - if (offset < 0 || offset + length > sliceLength) { + if ((length | offset) < 0 || length > sliceLength - offset) { throw new IllegalArgumentException(); } IndexInput clone = in.clone(); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/BaseDocIdSetTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/BaseDocIdSetTestCase.java index 2c1dfc72a31a..c74757b542ca 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/BaseDocIdSetTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/BaseDocIdSetTestCase.java @@ -24,6 +24,7 @@ import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; /** Base test class for {@link DocIdSet}s. */ public abstract class BaseDocIdSetTestCase extends LuceneTestCase { @@ -196,4 +197,71 @@ private long ramBytesUsed(DocIdSet set, int length) throws IOException { long bytes2 = RamUsageTester.ramUsed(dummy); return bytes1 - bytes2; } + + public void testIntoBitSet() throws IOException { + Random random = random(); + final int numBits = TestUtil.nextInt(random, 100, 1 << 20); + // test various random sets with various load factors + for (float percentSet : new float[] {0f, 0.0001f, random.nextFloat(), 0.9f, 1f}) { + final BitSet set = randomSet(numBits, percentSet); + final T copy = copyOf(set, numBits); + int from = TestUtil.nextInt(random(), 0, numBits - 1); + int to = TestUtil.nextInt(random(), from, numBits + 5); + FixedBitSet actual = new FixedBitSet(to - from); + DocIdSetIterator it1 = copy.iterator(); + if (it1 == null) { + continue; + } + int fromDoc = it1.advance(from); + // No docs to set + it1.intoBitSet(from, actual, from); + assertTrue(actual.scanIsEmpty()); + assertEquals(fromDoc, it1.docID()); + + // Now actually set some bits + it1.intoBitSet(to, actual, from); + FixedBitSet expected = new FixedBitSet(to - from); + DocIdSetIterator it2 = copy.iterator(); + for (int doc = it2.advance(from); doc < to; doc = it2.nextDoc()) { + expected.set(doc - from); + } + assertEquals(expected, actual); + // Check if docID() / nextDoc() return the same value after #intoBitSet has been called. 
+ assertEquals(it2.docID(), it1.docID()); + if (it2.docID() != DocIdSetIterator.NO_MORE_DOCS) { + assertEquals(it2.nextDoc(), it1.nextDoc()); + } + } + } + + public void testIntoBitSetBoundChecks() throws IOException { + final BitSet set = new BitSet(); + set.set(20); + set.set(42); + final T copy = copyOf(set, 256); + int from = TestUtil.nextInt(random(), 0, 20); + int to = TestUtil.nextInt(random(), 43, 256); + int offset = TestUtil.nextInt(random(), 0, from); + FixedBitSet dest1 = new FixedBitSet(42 - offset + 1); + DocIdSetIterator it1 = copy.iterator(); + it1.advance(from); + // This call is legal, since all "set" bits are in the range + it1.intoBitSet(to, dest1, offset); + for (int i = 0; i < dest1.length(); ++i) { + assertEquals(offset + i == 20 || offset + i == 42, dest1.get(i)); + } + + FixedBitSet dest2 = new FixedBitSet(42 - offset); + DocIdSetIterator it2 = copy.iterator(); + it2.advance(from); + // This call is not legal, since there is one bit that is set beyond the end of the target bit + // set + expectThrows(Throwable.class, () -> it2.intoBitSet(to, dest2, offset)); + + FixedBitSet dest3 = new FixedBitSet(42 - offset + 1); + DocIdSetIterator it3 = copy.iterator(); + it3.advance(from); + // This call is not legal, since offset is greater than the current doc + expectThrows(Throwable.class, () -> it3.intoBitSet(to, dest3, 21)); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java index 84fa120b88b1..ff4eb908e9b0 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java @@ -1096,11 +1096,6 @@ private static void configureRandom(Random r, MergePolicy mergePolicy) { public static TieredMergePolicy newTieredMergePolicy(Random r) { TieredMergePolicy tmp = new TieredMergePolicy(); - if (rarely(r)) { - tmp.setMaxMergeAtOnce(TestUtil.nextInt(r, 2, 9)); - } else { - tmp.setMaxMergeAtOnce(TestUtil.nextInt(r, 10, 50)); - } if (rarely(r)) { tmp.setMaxMergedSegmentMB(0.2 + r.nextDouble() * 2.0); } else { @@ -1235,11 +1230,6 @@ public static void maybeChangeLiveIndexWriterConfig(Random r, LiveIndexWriterCon } } else if (mp instanceof TieredMergePolicy) { TieredMergePolicy tmp = (TieredMergePolicy) mp; - if (rarely(r)) { - tmp.setMaxMergeAtOnce(TestUtil.nextInt(r, 2, 9)); - } else { - tmp.setMaxMergeAtOnce(TestUtil.nextInt(r, 10, 50)); - } if (rarely(r)) { tmp.setMaxMergedSegmentMB(0.2 + r.nextDouble() * 2.0); } else { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java index 6715edecc166..c2f5d886e3c0 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java @@ -1453,7 +1453,6 @@ public static void reduceOpenFiles(IndexWriter w) { lmp.setMergeFactor(Math.min(5, lmp.getMergeFactor())); } else if (mp instanceof TieredMergePolicy) { TieredMergePolicy tmp = (TieredMergePolicy) mp; - tmp.setMaxMergeAtOnce(Math.min(5, tmp.getMaxMergeAtOnce())); tmp.setSegmentsPerTier(Math.min(5, tmp.getSegmentsPerTier())); } MergeScheduler ms = w.getConfig().getMergeScheduler(); diff --git a/settings.gradle b/settings.gradle index f4ee13243ca6..8543bab1619f 100644 --- a/settings.gradle +++ b/settings.gradle @@ -26,8 +26,8 @@ pluginManagement { plugins 
{ id "org.gradle.toolchains.foojay-resolver-convention" version "0.8.0" - id 'com.gradle.enterprise' version '3.15.1' - id 'com.gradle.common-custom-user-data-gradle-plugin' version '1.11.3' + id 'com.gradle.develocity' version '3.18.2' + id 'com.gradle.common-custom-user-data-gradle-plugin' version '2.0.2' } dependencyResolutionManagement { diff --git a/versions.lock b/versions.lock index 26de44f99e2d..a98d277acf2c 100644 --- a/versions.lock +++ b/versions.lock @@ -4,15 +4,17 @@ "main_dependencies" : { "com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.8.1" : "fa9ef26b,refs=4", "com.ibm.icu:icu4j:74.2" : "47ea4550,refs=6", - "commons-codec:commons-codec:1.13" : "e9962aab,refs=4", + "com.nvidia.cuvs:cuvs-java:25.02.0" : "0129b4f0,refs=6", + "commons-codec:commons-codec:1.17.2" : "e9962aab,refs=4", "io.sgr:s2-geometry-library-java:1.0.0" : "cbc357ab,refs=4", "junit:junit:4.13.1" : "fa9ef26b,refs=4", "net.sf.jopt-simple:jopt-simple:5.0.4" : "85a1e4c6,refs=2", "net.sourceforge.nekohtml:nekohtml:1.9.17" : "5ce8cdc6,refs=2", "org.antlr:antlr4-runtime:4.11.1" : "d9953130,refs=4", "org.apache.commons:commons-compress:1.19" : "5ce8cdc6,refs=2", + "org.apache.commons:commons-lang3:3.17.0" : "0129b4f0,refs=6", "org.apache.commons:commons-math3:3.6.1" : "85a1e4c6,refs=2", - "org.apache.opennlp:opennlp-tools:2.3.2" : "2f760bab,refs=4", + "org.apache.opennlp:opennlp-tools:2.5.3" : "2f760bab,refs=4", "org.carrot2:morfologik-fsa:2.1.9" : "79af844b,refs=4", "org.carrot2:morfologik-polish:2.1.9" : "fe494320,refs=3", "org.carrot2:morfologik-stemming:2.1.9" : "79af844b,refs=4", @@ -22,7 +24,7 @@ "org.ow2.asm:asm:9.6" : "d9953130,refs=4", "org.ow2.asm:asm-commons:9.6" : "d9953130,refs=4", "org.ow2.asm:asm-tree:9.6" : "d9953130,refs=4", - "org.slf4j:slf4j-api:1.7.36" : "2f760bab,refs=4", + "org.slf4j:slf4j-api:2.0.16" : "2f760bab,refs=4", "ua.net.nlp:morfologik-ukrainian-search:4.9.1" : "fe494320,refs=3", "xerces:xercesImpl:2.12.0" : "5ce8cdc6,refs=2" }, @@ -46,7 +48,8 @@ "com.google.j2objc:j2objc-annotations:1.3" : "6897bc09,refs=38", "com.google.protobuf:protobuf-java:3.19.2" : "6897bc09,refs=38", "com.ibm.icu:icu4j:74.2" : "ffa00415,refs=8", - "commons-codec:commons-codec:1.13" : "733734f0,refs=6", + "com.nvidia.cuvs:cuvs-java:25.02.0" : "7ac6f8d9,refs=9", + "commons-codec:commons-codec:1.17.2" : "733734f0,refs=6", "io.github.java-diff-utils:java-diff-utils:4.0" : "6897bc09,refs=38", "io.sgr:s2-geometry-library-java:1.0.0" : "1d5a4b2b,refs=4", "javax.inject:javax.inject:1" : "6897bc09,refs=38", @@ -55,8 +58,9 @@ "net.sourceforge.nekohtml:nekohtml:1.9.17" : "6f16ff86,refs=2", "org.antlr:antlr4-runtime:4.11.1" : "6fbc4021,refs=5", "org.apache.commons:commons-compress:1.19" : "6f16ff86,refs=2", + "org.apache.commons:commons-lang3:3.17.0" : "7ac6f8d9,refs=9", "org.apache.commons:commons-math3:3.6.1" : "152d9f78,refs=3", - "org.apache.opennlp:opennlp-tools:2.3.2" : "b91715f0,refs=6", + "org.apache.opennlp:opennlp-tools:2.5.3" : "b91715f0,refs=6", "org.assertj:assertj-core:3.21.0" : "b7ba1646,refs=2", "org.carrot2:morfologik-fsa:2.1.9" : "e077a675,refs=8", "org.carrot2:morfologik-polish:2.1.9" : "cb00cecf,refs=5", @@ -73,12 +77,38 @@ "org.ow2.asm:asm-commons:9.6" : "6fbc4021,refs=5", "org.ow2.asm:asm-tree:9.6" : "6fbc4021,refs=5", "org.pcollections:pcollections:3.1.4" : "6897bc09,refs=38", - "org.slf4j:slf4j-api:1.7.36" : "b91715f0,refs=6", + "org.slf4j:slf4j-api:2.0.16" : "b91715f0,refs=6", "ua.net.nlp:morfologik-ukrainian-search:4.9.1" : "cb00cecf,refs=5", "xerces:xercesImpl:2.12.0" : 
"6f16ff86,refs=2" } }, "because" : { + "0129b4f0" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "152d9f78" : [ { "configuration" : "annotationProcessor", @@ -405,6 +435,44 @@ "projectPath" : ":lucene:analysis:morfologik" } ], + "7ac6f8d9" : [ + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "85a1e4c6" : [ { "configuration" : "compileClasspath", @@ -932,4 +1000,4 @@ } ] } -} \ No newline at end of file +} diff --git a/versions.toml b/versions.toml index 80dc51f39bf2..7688f235a691 100644 --- a/versions.toml +++ b/versions.toml @@ -2,8 +2,10 @@ antlr = "4.11.1" asm = "9.6" assertj = "3.21.0" -commons-codec = "1.13" +commons-codec = "1.17.2" commons-compress = "1.19" +commons-lang3 = "3.17.0" +cuvs = "25.02.0" ecj = "3.36.0" errorprone = "2.18.0" flexmark = "0.61.24" @@ -25,7 +27,7 @@ minJava = "21" morfologik = "2.1.9" morfologik-ukrainian = "4.9.1" nekohtml = "1.9.17" -opennlp = "2.3.2" +opennlp = "2.5.3" procfork = "1.0.6" randomizedtesting = "2.8.1" rat = "0.14" @@ -42,6 +44,8 @@ asm-core = { module = "org.ow2.asm:asm", version.ref = "asm" } assertj = { module = "org.assertj:assertj-core", version.ref = "assertj" } commons-codec = { module = "commons-codec:commons-codec", version.ref = "commons-codec" } commons-compress = { module = "org.apache.commons:commons-compress", version.ref = "commons-compress" } +commons-lang3 = { module = "org.apache.commons:commons-lang3", version.ref = "commons-lang3" } +cuvs = { module = "com.nvidia.cuvs:cuvs-java", version.ref = "cuvs" } ecj = { module = "org.eclipse.jdt:ecj", version.ref = "ecj" } errorprone = { module = "com.google.errorprone:error_prone_core", version.ref = "errorprone" } flexmark-core = { module = "com.vladsch.flexmark:flexmark", version.ref = "flexmark" }