From b8a1162738e87ea75cbc0fd186f29f4738d44a55 Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Tue, 7 Jan 2025 21:17:26 +0530 Subject: [PATCH 01/88] Initial cut of CuVS into Lucene as a Codec in sandbox --- build-tools/build-infra/build.gradle | 1 + gradle/globals.gradle | 1 + lucene/sandbox/build.gradle | 7 + lucene/sandbox/src/java/module-info.java | 5 +- .../vectorsearch/CagraFieldVectorsWriter.java | 35 ++ .../sandbox/vectorsearch/CuVSCodec.java | 31 ++ .../sandbox/vectorsearch/CuVSIndex.java | 56 +++ .../vectorsearch/CuVSKnnFloatVectorQuery.java | 33 ++ .../sandbox/vectorsearch/CuVSSegmentFile.java | 43 +++ .../vectorsearch/CuVSVectorsFormat.java | 70 ++++ .../vectorsearch/CuVSVectorsReader.java | 310 ++++++++++++++++ .../vectorsearch/CuVSVectorsWriter.java | 339 ++++++++++++++++++ .../vectorsearch/PerLeafCuVSKnnCollector.java | 74 ++++ .../vectorsearch/SegmentInputStream.java | 90 +++++ .../lucene/sandbox/vectorsearch/Util.java | 142 ++++++++ .../sandbox/vectorsearch/package-info.java | 1 + .../sandbox/vectorsearch/IntegrationTest.java | 201 +++++++++++ versions.toml | 6 + 18 files changed, 1444 insertions(+), 1 deletion(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java diff --git a/build-tools/build-infra/build.gradle b/build-tools/build-infra/build.gradle index 5cb1426cba97..34d71f7509d3 100644 --- a/build-tools/build-infra/build.gradle +++ b/build-tools/build-infra/build.gradle @@ -22,6 +22,7 @@ plugins { } repositories { + mavenLocal() mavenCentral() } diff --git a/gradle/globals.gradle b/gradle/globals.gradle index bcab6461ea91..25bfddc9bebf 100644 --- a/gradle/globals.gradle +++ b/gradle/globals.gradle @@ -22,6 +22,7 @@ allprojects { // Repositories to fetch dependencies from. 
repositories { + mavenLocal() mavenCentral() } diff --git a/lucene/sandbox/build.gradle b/lucene/sandbox/build.gradle index 72762fe1c3d2..6d225fd78ba4 100644 --- a/lucene/sandbox/build.gradle +++ b/lucene/sandbox/build.gradle @@ -19,9 +19,16 @@ apply plugin: 'java-library' description = 'Various third party contributions and new ideas' +repositories { + mavenLocal() +} + + dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:queries') moduleApi project(':lucene:facet') moduleTestImplementation project(':lucene:test-framework') + moduleImplementation deps.commons.lang3 + moduleImplementation deps.cuvs } diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index f40a05af433a..b2d45adf4d30 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -20,7 +20,10 @@ requires org.apache.lucene.core; requires org.apache.lucene.queries; requires org.apache.lucene.facet; - + requires java.logging; + requires com.nvidia.cuvs; + requires org.apache.commons.lang3; + exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; exports org.apache.lucene.sandbox.codecs.quantization; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java new file mode 100644 index 000000000000..21c088bd84f8 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -0,0 +1,35 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.index.FieldInfo; + +public class CagraFieldVectorsWriter extends KnnFieldVectorsWriter { + + public final String fieldName; + public final ConcurrentHashMap vectors = new ConcurrentHashMap(); + public int fieldVectorDimension = -1; + + public CagraFieldVectorsWriter(FieldInfo fieldInfo) { + this.fieldName = fieldInfo.getName(); + this.fieldVectorDimension = fieldInfo.getVectorDimension(); + } + + @Override + public long ramBytesUsed() { + return fieldName.getBytes().length + Integer.BYTES + (vectors.size() * fieldVectorDimension * Float.BYTES); + } + + @Override + public void addValue(int docID, float[] vectorValue) throws IOException { + vectors.put(docID, vectorValue); + } + + @Override + public float[] copyValue(float[] vectorValue) { + throw new UnsupportedOperationException(); + } + +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java new file mode 100644 index 000000000000..448803bb7fc4 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -0,0 +1,31 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; + + +public class CuVSCodec extends FilterCodec { + + public CuVSCodec() { + this("CuVSCodec", new Lucene101Codec()); + } + + public CuVSCodec(String name, Codec delegate) { + super(name, delegate); + setKnnFormat(new CuVSVectorsFormat(1, 128, 64, 
MergeStrategy.NON_TRIVIAL_MERGE)); + } + + KnnVectorsFormat knnFormat = null; + + @Override + public KnnVectorsFormat knnVectorsFormat() { + return knnFormat; + } + + public void setKnnFormat(KnnVectorsFormat format) { + this.knnFormat = format; + } +} \ No newline at end of file diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java new file mode 100644 index 000000000000..1878b6c236bc --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -0,0 +1,56 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.util.List; +import java.util.Objects; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.CagraIndex; + +public class CuVSIndex { + private final CagraIndex cagraIndex; + private final BruteForceIndex bruteforceIndex; + private final List mapping; + private final List vectors; + private final int maxDocs; + + private final String fieldName; + private final String segmentName; + + public CuVSIndex(String segmentName, String fieldName, CagraIndex cagraIndex, List mapping, List vectors, int maxDocs, BruteForceIndex bruteforceIndex) { + this.cagraIndex = Objects.requireNonNull(cagraIndex); + this.bruteforceIndex = Objects.requireNonNull(bruteforceIndex); + this.mapping = Objects.requireNonNull(mapping); + this.vectors = Objects.requireNonNull(vectors); + this.fieldName = Objects.requireNonNull(fieldName); + this.segmentName = Objects.requireNonNull(segmentName); + this.maxDocs = Objects.requireNonNull(maxDocs); + } + + public CagraIndex getCagraIndex() { + return cagraIndex; + } + + public BruteForceIndex getBruteforceIndex() { + return bruteforceIndex; + } + + public List getMapping() { + return mapping; + } + + public String getFieldName() { + return fieldName; + } + + public List getVectors() { + return vectors; + } + + public String getSegmentName() { + return segmentName; + } + + public int getMaxDocs() { + return maxDocs; + } +} \ No newline at end of file diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java new file mode 100644 index 000000000000..1bbae88c5630 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -0,0 +1,33 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; + +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.knn.KnnCollectorManager; +import org.apache.lucene.util.Bits; + +public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { + + final private int iTopK; + final private int searchWidth; + + public CuVSKnnFloatVectorQuery(String field, float[] target, int k, int iTopK, int searchWidth) { + super(field, target, k); + this.iTopK = iTopK; + this.searchWidth = searchWidth; + } + + @Override + protected TopDocs approximateSearch(LeafReaderContext context, Bits acceptDocs, int visitedLimit, KnnCollectorManager knnCollectorManager) throws IOException { + + PerLeafCuVSKnnCollector results = new PerLeafCuVSKnnCollector(k, iTopK, searchWidth); + + LeafReader reader = context.reader(); + reader.searchNearestVectors(field, this.getTargetCopy(), results, null); + return 
results.topDocs(); + } + +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java new file mode 100644 index 000000000000..9ca0d63ba087 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java @@ -0,0 +1,43 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.logging.Logger; +import java.util.zip.Deflater; +import java.util.zip.ZipEntry; +import java.util.zip.ZipOutputStream; + +public class CuVSSegmentFile implements AutoCloseable{ + final private ZipOutputStream zos; + + private Set filesAdded = new HashSet(); + + public CuVSSegmentFile(OutputStream out) { + zos = new ZipOutputStream(out); + zos.setLevel(Deflater.NO_COMPRESSION); + } + + protected Logger log = Logger.getLogger(getClass().getName()); + + public void addFile(String name, byte[] bytes) throws IOException { + log.info("Writing the file: " + name + ", size="+bytes.length + ", space remaining: "+new File("/").getFreeSpace()); + ZipEntry indexFileZipEntry = new ZipEntry(name); + zos.putNextEntry(indexFileZipEntry); + zos.write(bytes, 0, bytes.length); + zos.closeEntry(); + filesAdded.add(name); + } + + public Set getFilesAdded() { + return Collections.unmodifiableSet(filesAdded); + } + + @Override + public void close() throws IOException { + zos.close(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java new file mode 100644 index 000000000000..c17b5258c9d5 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -0,0 +1,70 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; + +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; + +import com.nvidia.cuvs.CuVSResources; + +public class CuVSVectorsFormat extends KnnVectorsFormat { + + public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; + public static final String VECTOR_DATA_EXTENSION = "cag"; + public static final String META_EXTENSION = "cagmf"; + public static final int VERSION_CURRENT = 0; + public final int maxDimensions = 4096; + public final int cuvsWriterThreads; + public final int intGraphDegree; + public final int graphDegree; + public MergeStrategy mergeStrategy; + public static CuVSResources resources; + + public CuVSVectorsFormat() { + super("CuVSVectorsFormat"); + this.cuvsWriterThreads = 1; + this.intGraphDegree = 128; + this.graphDegree = 64; + try { + resources = new CuVSResources(); + } catch (Throwable e) { + e.printStackTrace(); + } + } + + public CuVSVectorsFormat(int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy) { + super("CuVSVectorsFormat"); + this.mergeStrategy = mergeStrategy; + this.cuvsWriterThreads = cuvsWriterThreads; + this.intGraphDegree = intGraphDegree; + this.graphDegree = graphDegree; + try { + resources = new CuVSResources(); + } catch (Throwable e) { + e.printStackTrace(); + } + } + + @Override + public CuVSVectorsWriter 
fieldsWriter(SegmentWriteState state) throws IOException { + return new CuVSVectorsWriter(state, cuvsWriterThreads, intGraphDegree, graphDegree, mergeStrategy, resources); + } + + @Override + public CuVSVectorsReader fieldsReader(SegmentReadState state) throws IOException { + try { + return new CuVSVectorsReader(state, resources); + } catch (Throwable e) { + e.printStackTrace(); + } + return null; + } + + @Override + public int getMaxDimensions(String fieldName) { + return maxDimensions; + } + +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java new file mode 100644 index 000000000000..cac870afec6c --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -0,0 +1,310 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.lang.StackWalker.StackFrame; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.logging.Logger; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +import org.apache.commons.lang3.SerializationUtils; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.TopKnnCollector; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.BruteForceQuery; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CagraQuery; +import com.nvidia.cuvs.CagraSearchParams; +import com.nvidia.cuvs.CuVSResources; +import com.nvidia.cuvs.HnswIndex; +import com.nvidia.cuvs.HnswIndexParams; + +public class CuVSVectorsReader extends KnnVectorsReader { + + protected Logger log = Logger.getLogger(getClass().getName()); + + IndexInput vectorDataReader = null; + public String fileName = null; + public byte[] indexFileBytes; + public int[] docIds; + public float[] vectors; + public SegmentReadState segmentState = null; + public int indexFilePayloadSize = 0; + public long initialFilePointerLoc = 0; + public SegmentInputStream segmentInputStream; + + // Field to List of Indexes + public Map> cuvsIndexes; + + private CuVSResources resources; + + public CuVSVectorsReader(SegmentReadState state, CuVSResources resources) throws Throwable { + + segmentState = state; + this.resources = resources; + + fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, + CuVSVectorsFormat.VECTOR_DATA_EXTENSION); + + vectorDataReader = segmentState.directory.openInput(fileName, segmentState.context); + CodecUtil.readIndexHeader(vectorDataReader); + + initialFilePointerLoc = vectorDataReader.getFilePointer(); + indexFilePayloadSize = (int)vectorDataReader.length() - (int)initialFilePointerLoc; //vectorMetaReader.readInt(); + 
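    // Note: the payload that follows the codec header is the zip archive produced by
    // CuVSVectorsWriter; SegmentInputStream exposes just that region of the ".cag" file as a
    // resettable InputStream so it can be wrapped in a ZipInputStream whenever the per-field
    // indexes need to be loaded.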
segmentInputStream = new SegmentInputStream(vectorDataReader, indexFilePayloadSize, initialFilePointerLoc); + log.info("payloadSize: " + indexFilePayloadSize); + log.info("initialFilePointerLoc: " + initialFilePointerLoc); + + List stackTrace = StackWalker.getInstance().walk(this::getStackTrace); + + boolean isMergeCase = false; + for (StackFrame s : stackTrace) { + if (s.toString().startsWith("org.apache.lucene.index.IndexWriter.merge")) { + isMergeCase = true; + log.info("Reader opening on merge call"); + break; + } + } + + log.info("Source of this segment "+segmentState.segmentSuffix+" is " + segmentState.segmentInfo.getDiagnostics().get(IndexWriter.SOURCE)); + log.info("Loading for " + segmentState.segmentInfo.name + ", mergeCase? " + isMergeCase); + //if (!isMergeCase) { nocommit: TODO: don't load the cagra index for merge case. + log.info("Not the merge case, hence loading for " + segmentState.segmentInfo.name); + this.cuvsIndexes = loadCuVSIndex(getIndexInputStream(), isMergeCase); + //} + } + + @SuppressWarnings({"unchecked"}) + private Map> loadCuVSIndex(ZipInputStream zis, boolean isMergeCase) throws Throwable { + Map> ret = new HashMap>(); + Map cagraIndexes = new HashMap(); + Map bruteforceIndexes = new HashMap(); + Map hnswIndexes = new HashMap(); + Map> mappings = new HashMap>(); + Map> vectors = new HashMap>(); + + Map maxDocs = null; // map of segment, maxDocs + ZipEntry ze; + while ((ze = zis.getNextEntry()) != null) { + String entry = ze.getName(); + + String segmentField = entry.split("\\.")[0]; + String extension = entry.split("\\.")[1]; + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + byte[] buffer = new byte[1024]; + int len = 0; + while ((len = zis.read(buffer)) != -1) { + baos.write(buffer, 0, len); + } + + switch (extension) { + case "meta": { + maxDocs = (Map) SerializationUtils.deserialize(baos.toByteArray()); // nocommit use IOUtils + break; + } + case "vec": { + vectors.put(segmentField, (List) SerializationUtils.deserialize(baos.toByteArray())); // nocommit use IOUtils + break; + } + case "map": { + List map = (List) SerializationUtils.deserialize(baos.toByteArray()); // nocommit use IOUtils + mappings.put(segmentField, map); + break; + } + case "cag": { + cagraIndexes.put(segmentField, new CagraIndex.Builder(resources) + .from(new ByteArrayInputStream(baos.toByteArray())) + .build()); + break; + } + case "bf": { + bruteforceIndexes.put(segmentField, new BruteForceIndex.Builder(resources) + .from(new ByteArrayInputStream(baos.toByteArray())) + .build()); + break; + } + case "hnsw": { + HnswIndexParams indexParams = new HnswIndexParams.Builder(resources) + .build(); + hnswIndexes.put(segmentField, new HnswIndex.Builder(resources) + .from(new ByteArrayInputStream(baos.toByteArray())) + .withIndexParams(indexParams) + .build()); + break; + } + } + } + + log.info("Loading cuvsIndexes from segment: " + segmentState.segmentInfo.name); + log.info("Diagnostics for this segment: " + segmentState.segmentInfo.getDiagnostics()); + log.info("Loading map of cagraIndexes: " + cagraIndexes); + log.info("Loading vectors: " + vectors); + log.info("Loading mapping: " + mappings); + + for (String segmentField: cagraIndexes.keySet()) { + log.info("Loading segmentField: " + segmentField); + String segment = segmentField.split("/")[0]; + String field = segmentField.split("/")[1]; + CuVSIndex cuvsIndex = new CuVSIndex(segment, field, cagraIndexes.get(segmentField), mappings.get(segmentField), vectors.get(segmentField), maxDocs.get(segment), 
bruteforceIndexes.get(segmentField)); + List listOfIndexes = ret.containsKey(field)? ret.get(field): new ArrayList(); + listOfIndexes.add(cuvsIndex); + ret.put(field, listOfIndexes); + } + return ret; + } + + public List getStackTrace(Stream stackFrameStream) { + return stackFrameStream.collect(Collectors.toList()); + } + + public ZipInputStream getIndexInputStream() throws IOException { + segmentInputStream.reset(); + return new ZipInputStream(segmentInputStream); + } + + @Override + public void close() throws IOException { + IOUtils.close(vectorDataReader); + } + + @Override + public void checkIntegrity() throws IOException { + // TODO: Pending implementation + } + + @Override + public FloatVectorValues getFloatVectorValues(String field) throws IOException { + throw new UnsupportedOperationException(); + /*return new FloatVectorValues() { + + int pos = -1; + + @Override + public int nextDoc() throws IOException { + pos++; + int size = cuvsIndexes.get(field).get(0).getMapping().size(); + if (pos >= size) return FloatVectorValues.NO_MORE_DOCS; + return cuvsIndexes.get(field).get(0).getMapping().get(pos); + } + + @Override + public int docID() { + return cuvsIndexes.get(field).get(0).getMapping().get(pos); + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public float[] vectorValue() throws IOException { + return cuvsIndexes.get(field).get(0).getVectors().get(pos); + + } + + @Override + public int size() { + return cuvsIndexes.get(field).get(0).getVectors().size(); + } + + @Override + public VectorScorer scorer(float[] query) throws IOException { + // TODO Auto-generated method stub + return null; + } + + @Override + public int dimension() { + // TODO Auto-generated method stub + return cuvsIndexes.get(field).get(0).getVectors().get(0).length; + } + };*/ + } + + @Override + public ByteVectorValues getByteVectorValues(String field) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + PerLeafCuVSKnnCollector cuvsCollector = knnCollector instanceof PerLeafCuVSKnnCollector? ((PerLeafCuVSKnnCollector)knnCollector): new PerLeafCuVSKnnCollector(knnCollector.k(), knnCollector.k(), 1); + TopKnnCollector defaultCollector = knnCollector instanceof TopKnnCollector? ((TopKnnCollector)knnCollector): null; + + int prevDocCount = 0; + + // log.debug("Will try to search all the indexes for segment "+segmentState.segmentInfo.name+", field "+field+": "+cuvsIndexes); + for (CuVSIndex cuvsIndex: cuvsIndexes.get(field)) { + try { + Map result = new HashMap(); + if (cuvsCollector.k() <= 1024) { + CagraSearchParams searchParams = new CagraSearchParams.Builder(resources) + .withItopkSize(cuvsCollector.iTopK) + .withSearchWidth(cuvsCollector.searchWidth) + .build(); + + CagraQuery query = new CagraQuery.Builder() + .withTopK(cuvsCollector.k()) + .withSearchParams(searchParams) + .withMapping(cuvsIndex.getMapping()) + .withQueryVectors(new float[][] {target}) + .build(); + + CagraIndex cagraIndex = cuvsIndex.getCagraIndex(); + assert (cagraIndex != null); + log.info("k is " + cuvsCollector.k()); + result = cagraIndex.search(query).getResults().get(0); // List expected to have only one entry because of single query "target". 
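          // "result" maps each index-local doc id to its distance for the single query vector;
          // the ids are shifted by prevDocCount further down so that hits from every per-segment
          // CuVS index land in this leaf's doc id space before being passed to the collector.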
+ log.info("INTERMEDIATE RESULT FROM CUVS: " + result + ", prevDocCount=" + prevDocCount); + } else { + BruteForceQuery bruteforceQuery = new BruteForceQuery.Builder() + .withQueryVectors(new float[][] { target }) + .withPrefilter(((FixedBitSet)acceptDocs).getBits()) + .withTopK(cuvsCollector.k()) + .build(); + + BruteForceIndex bruteforceIndex = cuvsIndex.getBruteforceIndex(); + result = bruteforceIndex.search(bruteforceQuery).getResults().get(0); + } + + for(Entry kv : result.entrySet()) { + if (defaultCollector != null) { + defaultCollector.collect(prevDocCount + kv.getKey(), kv.getValue()); + } + cuvsCollector.collect(prevDocCount + kv.getKey(), kv.getValue()); + } + + } catch (Throwable e) { + e.printStackTrace(); + } + prevDocCount += cuvsIndex.getMaxDocs(); + } + } + + @Override + public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + throw new UnsupportedOperationException(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java new file mode 100644 index 000000000000..1da7ca0f9e6c --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -0,0 +1,339 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.logging.Logger; + +import org.apache.commons.lang3.SerializationUtils; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Sorter.DocMap; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.IOUtils; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.BruteForceIndexParams; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CagraIndexParams; +import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo; +import com.nvidia.cuvs.CuVSResources; + +public class CuVSVectorsWriter extends KnnVectorsWriter { + + protected Logger log = Logger.getLogger(getClass().getName()); + + private List fieldVectorWriters = new ArrayList<>(); + private IndexOutput cuVSIndex = null; + private SegmentWriteState segmentWriteState = null; + private String cuVSDataFilename = null; + + private CagraIndex cagraIndex; + private CagraIndex cagraIndexForHnsw; + + private int cuvsWriterThreads; + private int intGraphDegree; + private int graphDegree; + private MergeStrategy mergeStrategy; + private CuVSResources resources; + + public enum MergeStrategy { + TRIVIAL_MERGE, NON_TRIVIAL_MERGE + }; + + public CuVSVectorsWriter(SegmentWriteState state, int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy, CuVSResources resources) + throws IOException { + super(); + this.segmentWriteState = state; + this.mergeStrategy = mergeStrategy; + this.cuvsWriterThreads = cuvsWriterThreads; + this.intGraphDegree = intGraphDegree; + this.graphDegree = graphDegree; + this.resources = resources; + + 
cuVSDataFilename = IndexFileNames.segmentFileName(this.segmentWriteState.segmentInfo.name, this.segmentWriteState.segmentSuffix, CuVSVectorsFormat.VECTOR_DATA_EXTENSION); + } + + @Override + public long ramBytesUsed() { + return 0; + } + + @Override + public void close() throws IOException { + IOUtils.close(cuVSIndex); + cuVSIndex = null; + fieldVectorWriters.clear(); + fieldVectorWriters = null; + } + + @Override + public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { + CagraFieldVectorsWriter cagraFieldVectorWriter = new CagraFieldVectorsWriter(fieldInfo); + fieldVectorWriters.add(cagraFieldVectorWriter); + return cagraFieldVectorWriter; + } + + private byte[] createCagraIndex(float[][] vectors, List mapping) throws Throwable { + CagraIndexParams indexParams = new CagraIndexParams.Builder(resources) + .withNumWriterThreads(cuvsWriterThreads) + .withIntermediateGraphDegree(intGraphDegree) + .withGraphDegree(graphDegree) + .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) + .build(); + + log.info("Indexing started: " + System.currentTimeMillis()); + cagraIndex = new CagraIndex.Builder(resources) + .withDataset(vectors) + .withIndexParams(indexParams) + .build(); + log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + vectors.length); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + File tmpFile = File.createTempFile("tmpindex", "cag"); // TODO: Should we make this a file with random names? + cagraIndex.serialize(baos, tmpFile); + return baos.toByteArray(); + } + + private byte[] createBruteForceIndex(float[][] vectors) throws Throwable { + BruteForceIndexParams indexParams = new BruteForceIndexParams.Builder() + .withNumWriterThreads(32) // TODO: Make this configurable later. + .build(); + + log.info("Indexing started: " + System.currentTimeMillis()); + BruteForceIndex index = new BruteForceIndex.Builder(resources) + .withIndexParams(indexParams) + .withDataset(vectors) + .build(); + + log.info("Indexing done: " + System.currentTimeMillis()); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + index.serialize(baos); + return baos.toByteArray(); + } + + private byte[] createHnswIndex(float[][] vectors) throws Throwable { + CagraIndexParams indexParams = new CagraIndexParams.Builder(resources) + .withNumWriterThreads(cuvsWriterThreads) + .withIntermediateGraphDegree(intGraphDegree) + .withGraphDegree(graphDegree) + .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) + .build(); + + log.info("Indexing started: " + System.currentTimeMillis()); + cagraIndexForHnsw = new CagraIndex.Builder(resources) + .withDataset(vectors) + .withIndexParams(indexParams) + .build(); + log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + vectors.length); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + File tmpFile = File.createTempFile("tmpindex", "hnsw"); + cagraIndexForHnsw.serializeToHNSW(baos, tmpFile); + return baos.toByteArray(); + } + + @SuppressWarnings({"resource", "rawtypes", "unchecked"}) + @Override + public void flush(int maxDoc, DocMap sortMap) throws IOException { + cuVSIndex = this.segmentWriteState.directory.createOutput(cuVSDataFilename, this.segmentWriteState.context); + CodecUtil.writeIndexHeader(cuVSIndex, CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, CuVSVectorsFormat.VERSION_CURRENT, this.segmentWriteState.segmentInfo.getId(), this.segmentWriteState.segmentSuffix); + + + CuVSSegmentFile cuVSFile = new CuVSSegmentFile(new SegmentOutputStream(cuVSIndex, 100000)); + + 
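    // metaMap records maxDoc for this segment; it is written as "<segment>.meta" at the end of the
    // zip so the reader can compute per-index doc id offsets when several CuVS indexes are searched.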
LinkedHashMap metaMap = new LinkedHashMap(); + + for (CagraFieldVectorsWriter field : fieldVectorWriters) { + long start = System.currentTimeMillis(); + + byte[] cagraIndexBytes = null; + byte[] bruteForceIndexBytes = null; + byte[] hnswIndexBytes = null; + try { + log.info("Starting CAGRA indexing, space remaining: "+new File("/").getFreeSpace()); + log.info("Starting CAGRA indexing, docs: " + field.vectors.size()); + + float vectors[][] = new float[field.vectors.size()][field.vectors.get(0).length]; + for (int i = 0; i < vectors.length; i++) { + for (int j = 0; j < vectors[i].length; j++) { + vectors[i][j] = field.vectors.get(i)[j]; + } + } + + cagraIndexBytes = createCagraIndex(vectors, new ArrayList(field.vectors.keySet())); // nocommit + bruteForceIndexBytes = createBruteForceIndex(vectors); + hnswIndexBytes = createHnswIndex(vectors); + } catch (Throwable e) { + e.printStackTrace(); + } + + start = System.currentTimeMillis(); + cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".cag", cagraIndexBytes); + log.info("time for writing CAGRA index bytes to zip: " + (System.currentTimeMillis() - start)); + + start = System.currentTimeMillis(); + cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".bf", bruteForceIndexBytes); + log.info("time for writing BRUTEFORCE index bytes to zip: " + (System.currentTimeMillis() - start)); + + start = System.currentTimeMillis(); + cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".hnsw", hnswIndexBytes); + log.info("time for writing HNSW index bytes to zip: " + (System.currentTimeMillis() - start)); + + start = System.currentTimeMillis(); + cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".vec", SerializationUtils.serialize(new ArrayList(field.vectors.values()))); + cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".map", SerializationUtils.serialize(new ArrayList(field.vectors.keySet()))); + log.info("list serializing and writing: " + (System.currentTimeMillis() - start)); + field.vectors.clear(); + } + + metaMap.put(segmentWriteState.segmentInfo.name, maxDoc); + cuVSFile.addFile(segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); + cuVSFile.close(); + + CodecUtil.writeFooter(cuVSIndex); + } + + SegmentOutputStream mergeOutputStream = null; + CuVSSegmentFile mergedIndexFile = null; + + @SuppressWarnings("resource") + @Override + public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { + List segInputStreams = new ArrayList(); + List readers = new ArrayList(); + + for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { + CuVSVectorsReader reader = (CuVSVectorsReader) mergeState.knnVectorsReaders[i]; + segInputStreams.add(reader.segmentInputStream); + readers.add(reader); + } + + log.info("Merging one field for segment: " + segmentWriteState.segmentInfo.name); + log.info("Segment files? 
" + Arrays.toString(segmentWriteState.directory.listAll())); + + if (!List.of(segmentWriteState.directory.listAll()).contains(cuVSDataFilename)) { + IndexOutput mergedVectorIndex = segmentWriteState.directory.createOutput(cuVSDataFilename, segmentWriteState.context); + CodecUtil.writeIndexHeader(mergedVectorIndex, CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, + CuVSVectorsFormat.VERSION_CURRENT, segmentWriteState.segmentInfo.getId(), segmentWriteState.segmentSuffix); + this.mergeOutputStream = new SegmentOutputStream(mergedVectorIndex, 100000); + mergedIndexFile = new CuVSSegmentFile(this.mergeOutputStream); + } + + log.info("Segment files? " + Arrays.toString(segmentWriteState.directory.listAll())); + + if (mergeStrategy.equals(MergeStrategy.TRIVIAL_MERGE)) { + Util.getMergedArchiveCOS(segInputStreams, segmentWriteState.segmentInfo.name, this.mergeOutputStream + ); + } else if (mergeStrategy.equals(MergeStrategy.NON_TRIVIAL_MERGE)) { + // nocommit: this doesn't merge all the fields + log.info("Readers: "+segInputStreams.size()+", deocMaps: "+mergeState.docMaps.length); + ArrayList docMapList = new ArrayList(); + + for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { + CuVSVectorsReader reader = (CuVSVectorsReader) mergeState.knnVectorsReaders[i]; + for (CuVSIndex index: reader.cuvsIndexes.get(fieldInfo.name)) { + log.info("Mapping for segment ("+reader.fileName+"): " + index.getMapping()); + log.info("Mapping for segment ("+reader.fileName+"): " + index.getMapping().size()); + for (int id=0; id mergedVectors = Util.getMergedVectors(segInputStreams, fieldInfo.name, segmentWriteState.segmentInfo.name); + log.info("Final mapping: " + docMapList); + log.info("Final mapping: " + docMapList.size()); + log.info("Merged vectors: " + mergedVectors.size()); + LinkedHashMap metaMap = new LinkedHashMap(); + byte[] cagraIndexBytes = null; + byte[] bruteForceIndexBytes = null; + byte[] hnswIndexBytes = null; + try { + float vectors[][] = new float[mergedVectors.size()][mergedVectors.get(0).length]; + for (int i = 0; i < vectors.length; i++) { + for (int j = 0; j < vectors[i].length; j++) { + vectors[i][j] = mergedVectors.get(i)[j]; + } + } + cagraIndexBytes = createCagraIndex(vectors, new ArrayList()); + bruteForceIndexBytes = createBruteForceIndex(vectors); + hnswIndexBytes = createHnswIndex(vectors); + } catch (Throwable e) { + e.printStackTrace(); + } + mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".cag", cagraIndexBytes); + mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".bf", bruteForceIndexBytes); + mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".hnsw", hnswIndexBytes); + mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".vec", SerializationUtils.serialize(mergedVectors)); + mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".map", SerializationUtils.serialize(docMapList)); + metaMap.put(segmentWriteState.segmentInfo.name, mergedVectors.size()); + if (mergedIndexFile.getFilesAdded().contains(segmentWriteState.segmentInfo.name + ".meta") == false) { + mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); + } + log.info("DocMaps: "+Arrays.toString(mergeState.docMaps)); + + metaMap.clear(); + } + } + + + @Override + public void finish() throws IOException { + if (this.mergeOutputStream!=null) { + mergedIndexFile.close(); + 
CodecUtil.writeFooter(mergeOutputStream.out); + IOUtils.close(mergeOutputStream.out); + this.mergeOutputStream = null; + this.mergedIndexFile = null; + } + } + + public class SegmentOutputStream extends OutputStream { + + IndexOutput out; + int bufferSize; + byte[] buffer; + int p; + + public SegmentOutputStream(IndexOutput out, int bufferSize) throws IOException { + super(); + this.out = out; + this.bufferSize = bufferSize; + this.buffer = new byte[this.bufferSize]; + } + + @Override + public void write(int b) throws IOException { + buffer[p] = (byte) b; + p += 1; + if (p == bufferSize) { + flush(); + } + } + + @Override + public void flush() throws IOException { + out.writeBytes(buffer, p); + p = 0; + } + + @Override + public void close() throws IOException { + this.flush(); + } + + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java new file mode 100644 index 000000000000..d4d19fad7041 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -0,0 +1,74 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TotalHits; + +public class PerLeafCuVSKnnCollector implements KnnCollector { + + public List scoreDocs; + public int topK = 0; + public int iTopK = topK; // TODO getter, no setter + public int searchWidth = 1; // TODO getter, no setter + public int results = 0; + + public PerLeafCuVSKnnCollector(int topK, int iTopK, int searchWidth) { + super(); + this.topK = topK; + this.iTopK = iTopK; + this.searchWidth = searchWidth; + scoreDocs = new ArrayList(); + } + + @Override + public boolean earlyTerminated() { + // TODO: may need implementation + return false; + } + + @Override + public void incVisitedCount(int count) { + // TODO: may need implementation + } + + @Override + public long visitedCount() { + // TODO: may need implementation + return 0; + } + + @Override + public long visitLimit() { + // TODO: may need implementation + return 0; + } + + @Override + public int k() { + return topK; + } + + @Override + @SuppressWarnings("cast") + public boolean collect(int docId, float similarity) { + scoreDocs.add(new ScoreDoc(docId, 1f/(float)(similarity))); + return true; + } + + @Override + public float minCompetitiveSimilarity() { + // TODO: may need implementation + return 0; + } + + @Override + public TopDocs topDocs() { + return new TopDocs(new TotalHits(scoreDocs.size(), TotalHits.Relation.EQUAL_TO), + scoreDocs.toArray(new ScoreDoc[scoreDocs.size()])); + } + +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java new file mode 100644 index 000000000000..a352269fbb1b --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java @@ -0,0 +1,90 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.lucene.store.IndexInput; + +public class SegmentInputStream extends InputStream { + + /** + * + */ + private final IndexInput indexInput; + public final long initialFilePointerPosition; + public final long limit; + public 
long pos = 0; + + // TODO: This input stream needs to be modified to enable buffering. + public SegmentInputStream(IndexInput indexInput, long limit, long initialFilePointerPosition) throws IOException { + super(); + this.indexInput = indexInput; + this.initialFilePointerPosition = initialFilePointerPosition; + this.limit = limit; + + this.indexInput.seek(initialFilePointerPosition); + } + + @Override + public int read() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int read(byte[] b, int off, int len) { + try { + long avail = limit - pos; + if (pos >= limit) { + return -1; + } + if (len > avail) { + len = (int) avail; + } + if (len <= 0) { + return 0; + } + indexInput.readBytes(b, off, len); + pos += len; + return len; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Override + public int read(byte[] b) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void reset() throws IOException { + indexInput.seek(initialFilePointerPosition); + pos = 0; + } + + @Override + public long skip(long n) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public boolean markSupported() { + return true; + } + + @Override + public void mark(int readlimit) { + throw new UnsupportedOperationException(); + } + + @Override + public void close() { + // Do nothing for now. + } + + @Override + public int available() { + throw new UnsupportedOperationException(); + } + +} \ No newline at end of file diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java new file mode 100644 index 000000000000..a8200e7b897b --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java @@ -0,0 +1,142 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.OutputStream; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Logger; +import java.util.zip.Deflater; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; +import java.util.zip.ZipOutputStream; + +public class Util { + + public static ByteArrayOutputStream getZipEntryBAOS(String fileName, SegmentInputStream segInputStream) + throws IOException { + segInputStream.reset(); + ZipInputStream zipInputStream = new ZipInputStream(segInputStream); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + boolean fileFound = false; + ZipEntry zipEntry; + while (zipInputStream.available() == 1 && ((zipEntry = zipInputStream.getNextEntry()) != null)) { + if (zipEntry.getName().equals(fileName)) { + fileFound = true; + byte[] buffer = new byte[1024]; + int length; + while ((length = zipInputStream.read(buffer)) != -1) { + baos.write(buffer, 0, length); + } + } + } + if (!fileFound) throw new FileNotFoundException(); + return baos; + } + + private static final Logger log = Logger.getLogger(Util.class.getName()); + + public static ArrayList getMergedVectors(List segInputStreams, String fieldName, String mergedSegmentName) + throws IOException { + ZipEntry zs; + ArrayList mergedVectors = new ArrayList(); + log.info("Getting mergedVectors..."); + for 
(SegmentInputStream segInputStream : segInputStreams) { + segInputStream.reset(); + ZipInputStream zipStream = new ZipInputStream(segInputStream); + while ((zs = zipStream.getNextEntry()) != null) { + log.info("Getting mergedVectors... " + zs.getName()); + byte[] buffer = new byte[1024]; + int length; + if (zs.getName().endsWith(".vec")) { + String field = zs.getName().split("\\.")[0].split("/")[1]; + if (fieldName.equals(field)) { + ByteArrayOutputStream baosM = new ByteArrayOutputStream(); + while ((length = zipStream.read(buffer)) != -1) { + baosM.write(buffer, 0, length); + } + List m = deSerializeListInMemory(baosM.toByteArray()); + mergedVectors.addAll(m); + } + } + } + } + return mergedVectors; + } + + public static void getMergedArchiveCOS(List segInputStreams, String mergedSegmentName, + OutputStream os) throws IOException { + ZipOutputStream zos = new ZipOutputStream(os); + ZipEntry zs; + Map mergedMetaMap = new LinkedHashMap(); + for (SegmentInputStream segInputStream : segInputStreams) { + segInputStream.reset(); + ZipInputStream zipStream = new ZipInputStream(segInputStream); + while ((zs = zipStream.getNextEntry()) != null) { + byte[] buffer = new byte[1024]; + int length; + if (zs.getName().endsWith(".meta")) { + ByteArrayOutputStream baosM = new ByteArrayOutputStream(); + while ((length = zipStream.read(buffer)) != -1) { + baosM.write(buffer, 0, length); + } + Map m = deSerializeMapInMemory(baosM.toByteArray()); + mergedMetaMap.putAll(m); + } else { + ZipEntry zipEntry = new ZipEntry(zs.getName()); + zos.putNextEntry(zipEntry); + zos.setLevel(Deflater.NO_COMPRESSION); + while ((length = zipStream.read(buffer)) != -1) { + zos.write(buffer, 0, length); + } + zos.closeEntry(); + } + } + } + // Finally put the merged meta file + ZipEntry mergedMetaZipEntry = new ZipEntry(mergedSegmentName + ".meta"); + zos.putNextEntry(mergedMetaZipEntry); + zos.setLevel(Deflater.NO_COMPRESSION); + new ObjectOutputStream(zos).writeObject(mergedMetaMap); // Java serialization should be avoided + zos.closeEntry(); + zos.close(); + } + + @SuppressWarnings("unchecked") + public static Map deSerializeMapInMemory(byte[] bytes) { + Map map = null; + ObjectInputStream ois = null; + try { + ois = new ObjectInputStream(new ByteArrayInputStream(bytes)); + map = (Map) ois.readObject(); + ois.close(); + } catch (Exception e) { + e.printStackTrace(); + } + + return map; + } + + @SuppressWarnings("unchecked") + public static List deSerializeListInMemory(byte[] bytes) { + List map = null; + ObjectInputStream ois = null; + try { + ois = new ObjectInputStream(new ByteArrayInputStream(bytes)); + map = (List) ois.readObject(); + ois.close(); + } catch (Exception e) { + e.printStackTrace(); + } + + return map; + } + +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java new file mode 100644 index 000000000000..67199edca2f6 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java @@ -0,0 +1 @@ +package org.apache.lucene.sandbox.vectorsearch; diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java new file mode 100644 index 000000000000..89ee9a3879ba --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java @@ -0,0 +1,201 @@ +package org.apache.lucene.sandbox.vectorsearch; + 
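/*
 * End-to-end sanity test for the CuVS codec. The flow exercised below is essentially what a
 * minimal user of this codec would write; the following sketch is illustrative only and assumes
 * a Directory "dir", a float[] "floats" to index, a query vector "query" and a topK value:
 *
 *   IndexWriterConfig iwc = new IndexWriterConfig().setCodec(new CuVSCodec());
 *   try (IndexWriter w = new IndexWriter(dir, iwc)) {
 *     Document doc = new Document();
 *     doc.add(new KnnFloatVectorField("vector", floats, VectorSimilarityFunction.EUCLIDEAN));
 *     w.addDocument(doc);
 *   }
 *   TopDocs hits = new IndexSearcher(DirectoryReader.open(dir))
 *       .search(new CuVSKnnFloatVectorQuery("vector", query, topK, topK, 1), topK);
 *
 * The test indexes random vectors this way (via RandomIndexWriter) and checks the returned doc
 * ids against a brute-force nearest-neighbour computation over the same dataset
 * (see generateExpectedResults below).
 */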
+import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.TreeMap; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.analysis.MockTokenizer; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.English; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks; +import org.apache.lucene.tests.util.TestUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@SuppressSysoutChecks(bugUrl = "prints info from within cuvs") +public class IntegrationTest extends LuceneTestCase { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private static IndexSearcher searcher; + private static IndexReader reader; + private static Directory directory; + + public static int DATASET_SIZE_LIMIT = 1000; + public static int DIMENSIONS_LIMIT = 2048; + public static int NUM_QUERIES_LIMIT = 10; + public static int TOP_K_LIMIT = 64; // nocommit This fails beyond 64 + + public static float[][] dataset = null; + + @BeforeClass + public static void beforeClass() throws Exception { + directory = newDirectory(); + + Codec codec = new CuVSCodec(); + + RandomIndexWriter writer = + new RandomIndexWriter( + random(), + directory, + newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)) + .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)) + .setCodec(codec) + .setMergePolicy(newTieredMergePolicy())); + + log.info("Merge Policy: " + writer.w.getConfig().getMergePolicy()); + + Random random = random(); + int datasetSize = random.nextInt(DATASET_SIZE_LIMIT) + 1; + int dimensions = random.nextInt(DIMENSIONS_LIMIT) + 1; + dataset = generateDataset(random, datasetSize, dimensions); + for (int i = 0; i < datasetSize; i++) { + Document doc = new Document(); + doc.add(new StringField("id", String.valueOf(i), Field.Store.YES)); + doc.add(newTextField("field", English.intToEnglish(i), Field.Store.YES)); + boolean skipVector = random.nextInt(10) < 0; // nocommit disable testing with holes for now, there's some bug. 
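      // As written, skipVector is always false (nextInt(10) is never negative), so every document
      // currently gets vectors; the intent is to make roughly a tenth of the documents vector-less
      // once the hole-handling bug mentioned above is fixed.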
+ if (!skipVector || datasetSize<100) { // about 10th of the documents shouldn't have a single vector + doc.add(new KnnFloatVectorField("vector", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new KnnFloatVectorField("vector2", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); + } + + writer.addDocument(doc); + } + + reader = writer.getReader(); + searcher = newSearcher(reader); + writer.close(); + } + + @AfterClass + public static void afterClass() throws Exception { + // nocommit This fails until flat vectors are implemented + reader.close(); + directory.close(); + searcher = null; + reader = null; + directory = null; + log.info("Test finished"); + } + + @Test + public void testVectorSearch() throws IOException { + Random random = random(); + int numQueries = random.nextInt(NUM_QUERIES_LIMIT) + 1; + int topK = Math.min(random.nextInt(TOP_K_LIMIT) + 1, dataset.length); + + if(dataset.length < topK) topK = dataset.length; + + float[][] queries = generateQueries(random, dataset[0].length, numQueries); + List> expected = generateExpectedResults(topK, dataset, queries); + + debugPrintDatasetAndQueries(dataset, queries); + + log.info("Dataset size: {}x{}", dataset.length, dataset[0].length); + log.info("Query size: {}x{}", numQueries, queries[0].length); + log.info("TopK: {}", topK); + + Query query = new CuVSKnnFloatVectorQuery("vector", queries[0], topK, topK, 1); + int correct[] = new int[topK]; + for (int i=0; i> generateExpectedResults(int topK, float[][] dataset, float[][] queries) { + List> neighborsResult = new ArrayList<>(); + int dimensions = dataset[0].length; + + for (float[] query : queries) { + Map distances = new TreeMap<>(); + for (int j = 0; j < dataset.length; j++) { + double distance = 0; + for (int k = 0; k < dimensions; k++) { + distance += (query[k] - dataset[j][k]) * (query[k] - dataset[j][k]); + } + distances.put(j, (distance)); + } + + Map sorted = new TreeMap(distances); + log.info("EXPECTED: " + sorted); + + // Sort by distance and select the topK nearest neighbors + List neighbors = distances.entrySet().stream() + .sorted(Map.Entry.comparingByValue()) + .map(Map.Entry::getKey) + .toList(); + neighborsResult.add(neighbors.subList(0, Math.min(topK * 3, dataset.length))); // generate double the topK results in the expected array + } + + log.info("Expected results generated successfully."); + return neighborsResult; + } +} diff --git a/versions.toml b/versions.toml index 80dc51f39bf2..327848fd10d4 100644 --- a/versions.toml +++ b/versions.toml @@ -4,6 +4,8 @@ asm = "9.6" assertj = "3.21.0" commons-codec = "1.13" commons-compress = "1.19" +commons-lang3 = "3.17.0" +cuvs = "25.02" ecj = "3.36.0" errorprone = "2.18.0" flexmark = "0.61.24" @@ -33,6 +35,7 @@ s2-geometry = "1.0.0" spatial4j = "0.8" xerces = "2.12.0" zstd = "1.5.5-11" +jackson-core = "2.18.2" [libraries] antlr-core = { module = "org.antlr:antlr4", version.ref = "antlr" } @@ -42,6 +45,8 @@ asm-core = { module = "org.ow2.asm:asm", version.ref = "asm" } assertj = { module = "org.assertj:assertj-core", version.ref = "assertj" } commons-codec = { module = "commons-codec:commons-codec", version.ref = "commons-codec" } commons-compress = { module = "org.apache.commons:commons-compress", version.ref = "commons-compress" } +commons-lang3 = { module = "org.apache.commons:commons-lang3", version.ref = "commons-lang3" } +cuvs = { module = "com.nvidia.cuvs:cuvs-java", version.ref = "cuvs" } ecj = { module = "org.eclipse.jdt:ecj", version.ref = "ecj" } errorprone = { module = 
"com.google.errorprone:error_prone_core", version.ref = "errorprone" } flexmark-core = { module = "com.vladsch.flexmark:flexmark", version.ref = "flexmark" } @@ -52,6 +57,7 @@ flexmark-ext-tables = { module = "com.vladsch.flexmark:flexmark-ext-tables", ver groovy = { module = "org.apache.groovy:groovy-all", version.ref = "groovy" } hamcrest = { module = "org.hamcrest:hamcrest", version.ref = "hamcrest" } icu4j = { module = "com.ibm.icu:icu4j", version.ref = "icu4j" } +jackson-core = { module = "com.fasterxml.jackson.core:jackson-core", version.ref = "jackson-core" } javacc = { module = "net.java.dev.javacc:javacc", version.ref = "javacc" } jflex = { module = "de.jflex:jflex", version.ref = "jflex" } jgit = { module = "org.eclipse.jgit:org.eclipse.jgit", version.ref = "jgit" } From 0e9f6d4bc9a98eb33d594409ce8e4b3a6b4b1a06 Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Tue, 7 Jan 2025 21:28:17 +0530 Subject: [PATCH 02/88] Test fixes --- .../services/org.apache.lucene.codecs.Codec | 1 + .../org.apache.lucene.codecs.KnnVectorsFormat | 16 ++++++++++++++++ .../{IntegrationTest.java => TestCuVS.java} | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec create mode 100644 lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat rename lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/{IntegrationTest.java => TestCuVS.java} (99%) diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec new file mode 100644 index 000000000000..38b31884377d --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -0,0 +1 @@ +org.apache.lucene.sandbox.vectorsearch.CuVSCodec \ No newline at end of file diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat new file mode 100644 index 000000000000..666ee726f986 --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java similarity index 99% rename from lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java index 89ee9a3879ba..15a023d6fbd3 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/IntegrationTest.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java @@ -34,7 +34,7 @@ import org.slf4j.LoggerFactory; @SuppressSysoutChecks(bugUrl = "prints info from within cuvs") -public class IntegrationTest extends LuceneTestCase { +public class TestCuVS extends LuceneTestCase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); From a95f084e1d5a9d16128bd133e0631b193eed8709 Mon Sep 17 00:00:00 2001 From: Vivek Narang Date: Tue, 7 Jan 2025 12:32:57 -0500 Subject: [PATCH 03/88] fix for getFloatVectorValues --- .../vectorsearch/CuVSVectorsReader.java | 40 ++++--------------- 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index cac870afec6c..837a9229d061 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -196,52 +196,28 @@ public void checkIntegrity() throws IOException { @Override public FloatVectorValues getFloatVectorValues(String field) throws IOException { - throw new UnsupportedOperationException(); - /*return new FloatVectorValues() { - - int pos = -1; - - @Override - public int nextDoc() throws IOException { - pos++; - int size = cuvsIndexes.get(field).get(0).getMapping().size(); - if (pos >= size) return FloatVectorValues.NO_MORE_DOCS; - return cuvsIndexes.get(field).get(0).getMapping().get(pos); - } + return new FloatVectorValues() { @Override - public int docID() { - return cuvsIndexes.get(field).get(0).getMapping().get(pos); + public int size() { + return cuvsIndexes.get(field).get(0).getVectors().size(); } @Override - public int advance(int target) throws IOException { - throw new UnsupportedOperationException(); + public int dimension() { + return cuvsIndexes.get(field).get(0).getVectors().get(0).length; } @Override - public float[] vectorValue() throws IOException { + public float[] vectorValue(int pos) throws IOException { return cuvsIndexes.get(field).get(0).getVectors().get(pos); - } @Override - public int size() { - return cuvsIndexes.get(field).get(0).getVectors().size(); - } - - @Override - public VectorScorer scorer(float[] query) throws IOException { - // TODO Auto-generated method stub + public FloatVectorValues copy() throws IOException { return null; } - - @Override - public int dimension() { - // TODO Auto-generated method stub - return cuvsIndexes.get(field).get(0).getVectors().get(0).length; - } - };*/ + }; } @Override From d4f0a3244e51484b82e520647336b927656b25da Mon Sep 17 00:00:00 2001 From: Chris Hegarty <62058229+ChrisHegarty@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:30:44 +0000 Subject: [PATCH 04/88] prefetch may select the wrong memory segment for multi-segment slices (#14109) This commit fixes a bug 
where by prefetch may select the wrong memory segment for multi-segment slices. The issue was discovered when debugging a large test scenario, where the index input was backed by several memory segments. When sliced, a multi-segment index input uses an offset into the initial memory segment. This offset should be added to the prefetch offset to determine the absolute offset. --- lucene/CHANGES.txt | 4 +- .../lucene/store/MemorySegmentIndexInput.java | 16 ++++++-- .../lucene/store/TestMMapDirectory.java | 38 +++++++++++++++++++ 3 files changed, 53 insertions(+), 5 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2e574668a273..f3498cd5738e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -61,7 +61,9 @@ Optimizations Bug Fixes --------------------- -(No changes) + +* GITHUB#14109: prefetch may select the wrong memory segment for + multi-segment slices. (Chris Hegarty) Other --------------------- diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java index 2424b53645bd..74594be5ec99 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java @@ -337,8 +337,6 @@ public void prefetch(long offset, long length) throws IOException { ensureOpen(); - Objects.checkFromIndexSize(offset, length, length()); - if (BitUtil.isZeroOrPowerOfTwo(consecutivePrefetchHitCount++) == false) { // We've had enough consecutive hits on the page cache that this number is neither zero nor a // power of two. There is a good chance that a good chunk of this index input is cached in @@ -381,8 +379,6 @@ void advise(long offset, long length, IOConsumer advice) throws I ensureOpen(); - Objects.checkFromIndexSize(offset, length, length()); - final NativeAccess nativeAccess = NATIVE_ACCESS.get(); try { @@ -818,6 +814,12 @@ public MemorySegment segmentSliceOrNull(long pos, long len) throws IOException { throw handlePositionalIOOBE(e, "segmentSliceOrNull", pos); } } + + @Override + public void prefetch(long offset, long length) throws IOException { + Objects.checkFromIndexSize(offset, length, this.length); + super.prefetch(offset, length); + } } /** This class adds offset support to MemorySegmentIndexInput, which is needed for slices. 
*/ @@ -903,5 +905,11 @@ public MemorySegment segmentSliceOrNull(long pos, long len) throws IOException { MemorySegmentIndexInput buildSlice(String sliceDescription, long ofs, long length) { return super.buildSlice(sliceDescription, this.offset + ofs, length); } + + @Override + public void prefetch(long offset, long length) throws IOException { + Objects.checkFromIndexSize(offset, length, this.length); + super.prefetch(this.offset + offset, length); + } } } diff --git a/lucene/core/src/test/org/apache/lucene/store/TestMMapDirectory.java b/lucene/core/src/test/org/apache/lucene/store/TestMMapDirectory.java index d01d6ec50ebb..f69befca850c 100644 --- a/lucene/core/src/test/org/apache/lucene/store/TestMMapDirectory.java +++ b/lucene/core/src/test/org/apache/lucene/store/TestMMapDirectory.java @@ -329,4 +329,42 @@ public void testNoGroupingFunc() { assertFalse(func.apply("segment.si").isPresent()); assertFalse(func.apply("_51a.si").isPresent()); } + + public void testPrefetchWithSingleSegment() throws IOException { + testPrefetchWithSegments(64 * 1024); + } + + public void testPrefetchWithMultiSegment() throws IOException { + testPrefetchWithSegments(16 * 1024); + } + + static final Class IOOBE = IndexOutOfBoundsException.class; + + // does not verify that the actual segment is prefetched, but rather exercises the code and bounds + void testPrefetchWithSegments(int maxChunkSize) throws IOException { + byte[] bytes = new byte[(maxChunkSize * 2) + 1]; + try (Directory dir = + new MMapDirectory(createTempDir("testPrefetchWithSegments"), maxChunkSize)) { + try (IndexOutput out = dir.createOutput("test", IOContext.DEFAULT)) { + out.writeBytes(bytes, 0, bytes.length); + } + + try (var in = dir.openInput("test", IOContext.READONCE)) { + in.prefetch(0, in.length()); + expectThrows(IOOBE, () -> in.prefetch(1, in.length())); + expectThrows(IOOBE, () -> in.prefetch(in.length(), 1)); + + var slice1 = in.slice("slice-1", 1, in.length() - 1); + slice1.prefetch(0, slice1.length()); + expectThrows(IOOBE, () -> slice1.prefetch(1, slice1.length())); + expectThrows(IOOBE, () -> slice1.prefetch(slice1.length(), 1)); + + // we sliced off all but one byte from the first complete memory segment + var slice2 = in.slice("slice-2", maxChunkSize - 1, in.length() - maxChunkSize + 1); + slice2.prefetch(0, slice2.length()); + expectThrows(IOOBE, () -> slice2.prefetch(1, slice2.length())); + expectThrows(IOOBE, () -> slice2.prefetch(slice2.length(), 1)); + } + } + } } From 11eb2c86979e79b34cf5420ba2d9e3c4e322d938 Mon Sep 17 00:00:00 2001 From: Ben Chaplin Date: Wed, 8 Jan 2025 13:26:35 -0500 Subject: [PATCH 05/88] Add some basic HNSW graph checks to CheckIndex (#13984) --- .../org/apache/lucene/index/CheckIndex.java | 229 ++++++++++++++++++ 1 file changed, 229 insertions(+) diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index d957af01d0a2..b3a5e4dc5d11 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -26,8 +26,11 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.text.NumberFormat; +import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; +import java.util.Deque; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -52,12 +55,14 @@ import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; import 
org.apache.lucene.codecs.hnsw.FlatVectorsReader; +import org.apache.lucene.codecs.hnsw.HnswGraphProvider; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.DocumentStoredFieldVisitor; import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.Relation; +import org.apache.lucene.internal.hppc.IntIntHashMap; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.KnnCollector; @@ -74,6 +79,7 @@ import org.apache.lucene.store.Lock; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil.ByteArrayComparator; +import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -91,6 +97,7 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.hnsw.HnswGraph; /** * Basic tool and API to check the health of an index and write a new segments file that removes @@ -249,6 +256,9 @@ public static class SegmentInfoStatus { /** Status of vectors */ public VectorValuesStatus vectorValuesStatus; + /** Status of HNSW graph */ + public HnswGraphsStatus hnswGraphsStatus; + /** Status of soft deletes */ public SoftDeletesStatus softDeletesStatus; @@ -406,6 +416,32 @@ public static final class VectorValuesStatus { public Throwable error; } + /** Status from testing a single HNSW graph */ + public static final class HnswGraphStatus { + + HnswGraphStatus() {} + + /** Number of nodes at each level */ + public List numNodesAtLevel; + + /** Connectedness at each level represented as a fraction */ + public List connectednessAtLevel; + } + + /** Status from testing all HNSW graphs */ + public static final class HnswGraphsStatus { + + HnswGraphsStatus() { + this.hnswGraphsStatusByField = new HashMap<>(); + } + + /** Status of the HNSW graph keyed with field name */ + public Map hnswGraphsStatusByField; + + /** Exception thrown during term index test (null on success) */ + public Throwable error; + } + /** Status from testing index sort */ public static final class IndexSortStatus { IndexSortStatus() {} @@ -1085,6 +1121,9 @@ private Status.SegmentInfoStatus testSegment( // Test FloatVectorValues and ByteVectorValues segInfoStat.vectorValuesStatus = testVectors(reader, infoStream, failFast); + // Test HNSW graph + segInfoStat.hnswGraphsStatus = testHnswGraphs(reader, infoStream, failFast); + // Test Index Sort if (indexSort != null) { segInfoStat.indexSortStatus = testSort(reader, indexSort, infoStream, failFast); @@ -2746,6 +2785,196 @@ public static Status.VectorValuesStatus testVectors( return status; } + /** Test the HNSW graph. 
*/ + public static Status.HnswGraphsStatus testHnswGraphs( + CodecReader reader, PrintStream infoStream, boolean failFast) throws IOException { + if (infoStream != null) { + infoStream.print(" test: hnsw graphs........."); + } + long startNS = System.nanoTime(); + Status.HnswGraphsStatus status = new Status.HnswGraphsStatus(); + KnnVectorsReader vectorsReader = reader.getVectorReader(); + FieldInfos fieldInfos = reader.getFieldInfos(); + + try { + if (fieldInfos.hasVectorValues()) { + for (FieldInfo fieldInfo : fieldInfos) { + if (fieldInfo.hasVectorValues()) { + KnnVectorsReader fieldReader = getFieldReaderForName(vectorsReader, fieldInfo.name); + if (fieldReader instanceof HnswGraphProvider graphProvider) { + HnswGraph hnswGraph = graphProvider.getGraph(fieldInfo.name); + testHnswGraph(hnswGraph, fieldInfo.name, status); + } + } + } + } + msg( + infoStream, + String.format( + Locale.ROOT, + "OK [%d fields] [took %.3f sec]", + status.hnswGraphsStatusByField.size(), + nsToSec(System.nanoTime() - startNS))); + printHnswInfo(infoStream, status.hnswGraphsStatusByField); + } catch (Exception e) { + if (failFast) { + throw IOUtils.rethrowAlways(e); + } + msg(infoStream, "ERROR: " + e); + status.error = e; + if (infoStream != null) { + e.printStackTrace(infoStream); + } + } + + return status; + } + + private static KnnVectorsReader getFieldReaderForName( + KnnVectorsReader vectorsReader, String fieldName) { + if (vectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader fieldsReader) { + return fieldsReader.getFieldReader(fieldName); + } else { + return vectorsReader; + } + } + + private static void printHnswInfo( + PrintStream infoStream, Map fieldsStatus) { + for (Map.Entry entry : fieldsStatus.entrySet()) { + String fieldName = entry.getKey(); + CheckIndex.Status.HnswGraphStatus status = entry.getValue(); + msg(infoStream, " hnsw field name: " + fieldName); + + int numLevels = Math.min(status.numNodesAtLevel.size(), status.connectednessAtLevel.size()); + for (int level = numLevels - 1; level >= 0; level--) { + int numNodes = status.numNodesAtLevel.get(level); + String connectedness = status.connectednessAtLevel.get(level); + msg( + infoStream, + String.format( + Locale.ROOT, + " level %d: %d nodes, %s connected", + level, + numNodes, + connectedness)); + } + } + } + + private static void testHnswGraph( + HnswGraph hnswGraph, String fieldName, Status.HnswGraphsStatus status) + throws IOException, CheckIndexException { + if (hnswGraph != null) { + status.hnswGraphsStatusByField.put(fieldName, new Status.HnswGraphStatus()); + final int numLevels = hnswGraph.numLevels(); + status.hnswGraphsStatusByField.get(fieldName).numNodesAtLevel = + new ArrayList<>(Collections.nCopies(numLevels, null)); + status.hnswGraphsStatusByField.get(fieldName).connectednessAtLevel = + new ArrayList<>(Collections.nCopies(numLevels, null)); + // Perform checks on each level of the HNSW graph + for (int level = numLevels - 1; level >= 0; level--) { + // Collect BitSet of all nodes on this level + BitSet nodesOnThisLevel = new FixedBitSet(hnswGraph.size()); + HnswGraph.NodesIterator nodesIterator = hnswGraph.getNodesOnLevel(level); + while (nodesIterator.hasNext()) { + nodesOnThisLevel.set(nodesIterator.nextInt()); + } + + nodesIterator = hnswGraph.getNodesOnLevel(level); + // Perform checks on each node on the level + while (nodesIterator.hasNext()) { + int node = nodesIterator.nextInt(); + if (node < 0 || node > hnswGraph.size() - 1) { + throw new CheckIndexException( + "Field \"" + + fieldName + + "\" has node: " 
+ + node + + " not in the expected range [0, " + + (hnswGraph.size() - 1) + + "]"); + } + + // Perform checks on the node's neighbors + hnswGraph.seek(level, node); + int nbr, lastNeighbor = -1, firstNeighbor = -1; + while ((nbr = hnswGraph.nextNeighbor()) != NO_MORE_DOCS) { + if (!nodesOnThisLevel.get(nbr)) { + throw new CheckIndexException( + "Field \"" + + fieldName + + "\" has node: " + + node + + " with a neighbor " + + nbr + + " which is not on its level (" + + level + + ")"); + } + if (firstNeighbor == -1) { + firstNeighbor = nbr; + } + if (nbr < lastNeighbor) { + throw new CheckIndexException( + "Field \"" + + fieldName + + "\" has neighbors out of order for node " + + node + + ": " + + nbr + + "<" + + lastNeighbor + + " 1st=" + + firstNeighbor); + } else if (nbr == lastNeighbor) { + throw new CheckIndexException( + "Field \"" + + fieldName + + "\" has repeated neighbors of node " + + node + + " with value " + + nbr); + } + lastNeighbor = nbr; + } + } + int numNodesOnLayer = nodesIterator.size(); + status.hnswGraphsStatusByField.get(fieldName).numNodesAtLevel.set(level, numNodesOnLayer); + + // Evaluate connectedness at this level by measuring the number of nodes reachable from the + // entry point + IntIntHashMap connectedNodes = getConnectedNodesOnLevel(hnswGraph, numNodesOnLayer, level); + status + .hnswGraphsStatusByField + .get(fieldName) + .connectednessAtLevel + .set(level, connectedNodes.size() + "/" + numNodesOnLayer); + } + } + } + + private static IntIntHashMap getConnectedNodesOnLevel( + HnswGraph hnswGraph, int numNodesOnLayer, int level) throws IOException { + IntIntHashMap connectedNodes = new IntIntHashMap(numNodesOnLayer); + int entryPoint = hnswGraph.entryNode(); + Deque stack = new ArrayDeque<>(); + stack.push(entryPoint); + while (!stack.isEmpty()) { + int node = stack.pop(); + if (connectedNodes.containsKey(node)) { + continue; + } + connectedNodes.put(node, 1); + hnswGraph.seek(level, node); + int friendOrd; + while ((friendOrd = hnswGraph.nextNeighbor()) != NO_MORE_DOCS) { + stack.push(friendOrd); + } + } + return connectedNodes; + } + private static boolean vectorsReaderSupportsSearch(CodecReader codecReader, String fieldName) { KnnVectorsReader vectorsReader = codecReader.getVectorReader(); if (vectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader perFieldReader) { From 5fd2e70305cb2ec1db731084254f34042dbf5e71 Mon Sep 17 00:00:00 2001 From: Ben Chaplin Date: Wed, 8 Jan 2025 16:47:54 -0500 Subject: [PATCH 06/88] Add CHANGES entry for CheckIndex HNSW work (#14120) --- lucene/CHANGES.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f3498cd5738e..b7f38f5a688d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -50,6 +50,7 @@ Improvements * GITHUB#14079: Hunspell Dictionary now supports an option to tolerate REP rule count mismatches. 
(Robert Muir) +* GITHUB#13984: Add HNSW graph checks and stats to CheckIndex Optimizations --------------------- From 0169c1efea0542da4ed644ea4016d9a2c79148ed Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Wed, 8 Jan 2025 17:46:20 -0500 Subject: [PATCH 07/88] Fix test that was implicitly assuming simple writer config that would not rearrange docids (#14122) --- .../org/apache/lucene/misc/index/TestBpVectorReorderer.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java index 7441e68f7d48..bea3812e4171 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java @@ -380,7 +380,7 @@ public void testIndexReorderSparse() throws Exception { int maxDoc = 0; try (Directory dir = newFSDirectory(tmpdir)) { // create an index with a single leaf - try (IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig())) { + try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) { for (float[] vector : vectors) { Document doc = new Document(); if (random().nextBoolean()) { @@ -394,7 +394,6 @@ public void testIndexReorderSparse() throws Exception { writer.addDocument(doc); maxDoc++; } - writer.forceMerge(1); } // reorder using the index reordering tool BpVectorReorderer.main( From 3fad7193378950bcff650f28a55f20f440934946 Mon Sep 17 00:00:00 2001 From: Shubham Sharma Date: Thu, 9 Jan 2025 14:23:54 +0530 Subject: [PATCH 08/88] Updated releaseWizard.py to use timezone-aware objects to represent datetimes in UTC (#14102) Co-authored-by: Shubham Sharma --- dev-tools/scripts/releaseWizard.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/dev-tools/scripts/releaseWizard.py b/dev-tools/scripts/releaseWizard.py index d599095619d4..3814ae38a789 100755 --- a/dev-tools/scripts/releaseWizard.py +++ b/dev-tools/scripts/releaseWizard.py @@ -49,6 +49,7 @@ from collections import OrderedDict from datetime import datetime from datetime import timedelta +from datetime import timezone try: import holidays @@ -99,7 +100,7 @@ def expand_jinja(text, vars=None): 'state': state, 'gpg_key' : state.get_gpg_key(), 'gradle_cmd' : 'gradlew.bat' if is_windows() else './gradlew', - 'epoch': unix_time_millis(datetime.utcnow()), + 'epoch': unix_time_millis(datetime.now(tz=timezone.utc)), 'get_next_version': state.get_next_version(), 'current_git_rev': state.get_current_git_rev(), 'keys_downloaded': keys_downloaded(), @@ -199,7 +200,7 @@ def check_prerequisites(todo=None): return True -epoch = datetime.utcfromtimestamp(0) +epoch = datetime.fromtimestamp(timestamp=0, tz=timezone.utc) def unix_time_millis(dt): @@ -279,7 +280,7 @@ def __init__(self, config_path, release_version, script_version): self.latest_version = None self.previous_rcs = {} self.rc_number = 1 - self.start_date = unix_time_millis(datetime.utcnow()) + self.start_date = unix_time_millis(datetime.now(tz=timezone.utc)) self.script_branch = run("git rev-parse --abbrev-ref HEAD").strip() self.mirrored_versions = None try: @@ -741,7 +742,7 @@ def get_vars(self): def set_done(self, is_done): if is_done: - self.state['done_date'] = unix_time_millis(datetime.utcnow()) + self.state['done_date'] = unix_time_millis(datetime.now(tz=timezone.utc)) if self.persist_vars: for k in self.persist_vars: self.state[k] = self.get_vars()[k] @@ -935,7 
+936,7 @@ def expand_multiline(cmd_txt, indent=0): def unix_to_datetime(unix_stamp): - return datetime.utcfromtimestamp(unix_stamp / 1000) + return datetime.fromtimestamp(timestamp=unix_stamp / 1000, tz=timezone.utc) def generate_asciidoc(): @@ -949,7 +950,7 @@ def generate_asciidoc(): fh.write("= Lucene Release %s\n\n" % state.release_version) fh.write("(_Generated by releaseWizard.py v%s at %s_)\n\n" - % (getScriptVersion(), datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC"))) + % (getScriptVersion(), datetime.now(tz=timezone.utc).strftime("%Y-%m-%d %H:%M UTC"))) fh.write(":numbered:\n\n") fh.write("%s\n\n" % template('help')) for group in state.todo_groups: @@ -1839,9 +1840,9 @@ def create_ical(todo): # pylint: disable=unused-argument return True -today = datetime.utcnow().date() +today = datetime.now(tz=timezone.utc).date() sundays = {(today + timedelta(days=x)): 'Sunday' for x in range(10) if (today + timedelta(days=x)).weekday() == 6} -y = datetime.utcnow().year +y = datetime.now(tz=timezone.utc).year years = [y, y+1] non_working = holidays.CA(years=years) + holidays.US(years=years) + holidays.UK(years=years) \ + holidays.DE(years=years) + holidays.NO(years=years) + holidays.IN(years=years) + holidays.RU(years=years) @@ -1849,7 +1850,7 @@ def create_ical(todo): # pylint: disable=unused-argument def vote_close_72h_date(): # Voting open at least 72 hours according to ASF policy - return datetime.utcnow() + timedelta(hours=73) + return datetime.now(tz=timezone.utc) + timedelta(hours=73) def vote_close_72h_holidays(): From 2afc0a0f8b8c61fe5d36f705fe27413adbb6962a Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Thu, 9 Jan 2025 07:23:07 -0500 Subject: [PATCH 09/88] Preserve max-conn when merging onto existing graph Fixes gh#14118 (#14121) --- .../lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java index ce9ee1b79cc0..a587449e2e7b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java @@ -452,11 +452,12 @@ private void writeMeta( meta.writeVLong(vectorIndexLength); meta.writeVInt(field.getVectorDimension()); meta.writeInt(count); - meta.writeVInt(M); // write graph nodes on each level if (graph == null) { + meta.writeVInt(M); meta.writeVInt(0); } else { + meta.writeVInt(graph.maxConn()); meta.writeVInt(graph.numLevels()); long valueCount = 0; for (int level = 0; level < graph.numLevels(); level++) { From b7c7fe064ce10ee6c3780d8f021bc13572e94e55 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Thu, 9 Jan 2025 07:23:22 -0500 Subject: [PATCH 10/88] fix for gh#14110: improve BpVectorReordered heuristic to make it more stable (#14117) --- .../org/apache/lucene/misc/index/BpVectorReorderer.java | 6 ++---- .../org/apache/lucene/misc/index/TestBpVectorReorderer.java | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/BpVectorReorderer.java b/lucene/misc/src/java/org/apache/lucene/misc/index/BpVectorReorderer.java index 7facf48580c8..246109ede04c 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/index/BpVectorReorderer.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/index/BpVectorReorderer.java @@ -311,10 +311,8 @@ private int shuffle( 
depth, vectorScore) .compute(); - - float scale = - VectorUtil.dotProduct(leftCentroid, leftCentroid) - + VectorUtil.dotProduct(rightCentroid, rightCentroid); + vectorSubtract(leftCentroid, rightCentroid, scratch); + float scale = (float) Math.sqrt(VectorUtil.dotProduct(scratch, scratch)); float maxLeftBias = Float.NEGATIVE_INFINITY; for (int i = ids.offset; i < midPoint; ++i) { maxLeftBias = Math.max(maxLeftBias, biases[i]); diff --git a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java index bea3812e4171..e4398a76183a 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java @@ -62,7 +62,7 @@ public void setUp() throws Exception { } private void createQuantizedIndex(Directory dir, List vectors) throws IOException { - IndexWriterConfig cfg = newIndexWriterConfig(); + IndexWriterConfig cfg = new IndexWriterConfig(); cfg.setCodec( new Lucene101Codec() { @Override From 7c642179214e6ba9b9d0e4ace6597d506ce7683b Mon Sep 17 00:00:00 2001 From: Chris Hegarty <62058229+ChrisHegarty@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:27:57 +0000 Subject: [PATCH 11/88] DirectIOIndexInput - add overloads for primitive access (#14107) This commit adds overloads for primitive access to DirectIOIndexInput. Existing tests in TestDirectIOIndexInput already provide sufficient coverage for the changes in this PR. --- .../lucene/misc/store/DirectIODirectory.java | 39 ++++++++++++++++++- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java b/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java index ff7ea2341acd..40d43e613a01 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.misc.store; +import static java.nio.ByteOrder.LITTLE_ENDIAN; + import java.io.EOFException; import java.io.IOException; import java.io.UncheckedIOException; @@ -314,7 +316,7 @@ public DirectIOIndexInput(Path path, int blockSize, int bufferSize) throws IOExc this.blockSize = blockSize; this.channel = FileChannel.open(path, StandardOpenOption.READ, getDirectOpenOption()); - this.buffer = ByteBuffer.allocateDirect(bufferSize + blockSize - 1).alignedSlice(blockSize); + this.buffer = allocateBuffer(bufferSize, blockSize); isOpen = true; isClone = false; @@ -329,7 +331,7 @@ private DirectIOIndexInput(DirectIOIndexInput other) throws IOException { this.blockSize = other.blockSize; final int bufferSize = other.buffer.capacity(); - this.buffer = ByteBuffer.allocateDirect(bufferSize + blockSize - 1).alignedSlice(blockSize); + this.buffer = allocateBuffer(bufferSize, blockSize); isOpen = true; isClone = true; @@ -338,6 +340,12 @@ private DirectIOIndexInput(DirectIOIndexInput other) throws IOException { seek(other.getFilePointer()); } + private static ByteBuffer allocateBuffer(int bufferSize, int blockSize) { + return ByteBuffer.allocateDirect(bufferSize + blockSize - 1) + .alignedSlice(blockSize) + .order(LITTLE_ENDIAN); + } + @Override public void close() throws IOException { if (isOpen && !isClone) { @@ -389,6 +397,33 @@ public byte readByte() throws IOException { return buffer.get(); } + @Override + public short readShort() throws IOException { + if (buffer.remaining() >= 
Short.BYTES) { + return buffer.getShort(); + } else { + return super.readShort(); + } + } + + @Override + public int readInt() throws IOException { + if (buffer.remaining() >= Integer.BYTES) { + return buffer.getInt(); + } else { + return super.readInt(); + } + } + + @Override + public long readLong() throws IOException { + if (buffer.remaining() >= Long.BYTES) { + return buffer.getLong(); + } else { + return super.readLong(); + } + } + private void refill(int bytesToRead) throws IOException { filePos += buffer.capacity(); From 60efc4ad28e2f2686301e6b718ffc78b4befe697 Mon Sep 17 00:00:00 2001 From: Chris Hegarty <62058229+ChrisHegarty@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:23:18 +0000 Subject: [PATCH 12/88] DirectIOIndexInput - add overloads for bulk retrieval (#14124) This commit adds overloads for bulk retrieval to DirectIOIndexInput. The implementation of these methods is identical to that of BufferedIndexInput, and it already covered by existing tests. --- .../lucene/misc/store/DirectIODirectory.java | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java b/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java index 40d43e613a01..8b5f4fd76b77 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java @@ -463,6 +463,63 @@ public void readBytes(byte[] dst, int offset, int len) throws IOException { } } + @Override + public void readInts(int[] dst, int offset, int len) throws IOException { + int remainingDst = len; + while (remainingDst > 0) { + int cnt = Math.min(buffer.remaining() / Integer.BYTES, remainingDst); + buffer.asIntBuffer().get(dst, offset + len - remainingDst, cnt); + buffer.position(buffer.position() + Integer.BYTES * cnt); + remainingDst -= cnt; + if (remainingDst > 0) { + if (buffer.hasRemaining()) { + dst[offset + len - remainingDst] = readInt(); + --remainingDst; + } else { + refill(remainingDst * Integer.BYTES); + } + } + } + } + + @Override + public void readFloats(float[] dst, int offset, int len) throws IOException { + int remainingDst = len; + while (remainingDst > 0) { + int cnt = Math.min(buffer.remaining() / Float.BYTES, remainingDst); + buffer.asFloatBuffer().get(dst, offset + len - remainingDst, cnt); + buffer.position(buffer.position() + Float.BYTES * cnt); + remainingDst -= cnt; + if (remainingDst > 0) { + if (buffer.hasRemaining()) { + dst[offset + len - remainingDst] = Float.intBitsToFloat(readInt()); + --remainingDst; + } else { + refill(remainingDst * Float.BYTES); + } + } + } + } + + @Override + public void readLongs(long[] dst, int offset, int len) throws IOException { + int remainingDst = len; + while (remainingDst > 0) { + int cnt = Math.min(buffer.remaining() / Long.BYTES, remainingDst); + buffer.asLongBuffer().get(dst, offset + len - remainingDst, cnt); + buffer.position(buffer.position() + Long.BYTES * cnt); + remainingDst -= cnt; + if (remainingDst > 0) { + if (buffer.hasRemaining()) { + dst[offset + len - remainingDst] = readLong(); + --remainingDst; + } else { + refill(remainingDst * Long.BYTES); + } + } + } + } + @Override public DirectIOIndexInput clone() { try { From 2756cd982c52d20fec8acc8ddf9ce514cbe1d84a Mon Sep 17 00:00:00 2001 From: YeonghyeonKo <46114393+YeonghyeonKO@users.noreply.github.com> Date: Fri, 10 Jan 2025 01:49:59 +0900 Subject: [PATCH 13/88] Fix urls describing why NIOFS is not recommended for Windows 
(#14081) This patch fixes incorrect URL links in NIOFSDirectory and FSDirectory. --- lucene/CHANGES.txt | 2 +- lucene/core/src/java/org/apache/lucene/store/FSDirectory.java | 2 +- .../core/src/java/org/apache/lucene/store/NIOFSDirectory.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b7f38f5a688d..e6174479ee0a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -68,7 +68,7 @@ Bug Fixes Other --------------------- -(No changes) +* GITHUB#14081: Fix urls describing why NIOFS is not recommended for Windows (Marcel Yeonghyeon Ko) ======================= Lucene 10.1.0 ======================= diff --git a/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java b/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java index 413e22c45ae8..0a49cba05e49 100644 --- a/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java +++ b/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java @@ -60,7 +60,7 @@ * post. *
  • {@link NIOFSDirectory} uses java.nio's FileChannel's positional io when reading to avoid * synchronization when reading from the same file. Unfortunately, due to a Windows-only Sun JRE bug this is a + * href="https://bugs.java.com/bugdatabase/view_bug?bug_id=6265734">Sun JRE bug this is a * poor choice for Windows, but on all other platforms this is the preferred choice. * Applications using {@link Thread#interrupt()} or {@link Future#cancel(boolean)} should use * {@code RAFDirectory} instead, which is provided in the {@code misc} module. See {@link diff --git a/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java b/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java index 246f48082cfe..c9c92db91b40 100644 --- a/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java +++ b/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java @@ -36,7 +36,7 @@ *

    NOTE: NIOFSDirectory is not recommended on Windows because of a bug in how * FileChannel.read is implemented in Sun's JRE. Inside of the implementation the position is * apparently synchronized. See here for details. + * href="https://bugs.java.com/bugdatabase/view_bug?bug_id=6265734">here for details. * *

    NOTE: Accessing this class either directly or indirectly from a thread while it's * interrupted can close the underlying file descriptor immediately if at the same time the thread From ee65e8f8b141d44ada269d08c897993ec41ffa2f Mon Sep 17 00:00:00 2001 From: Ao Li <5557706+aoli-al@users.noreply.github.com> Date: Thu, 9 Jan 2025 15:28:01 -0500 Subject: [PATCH 14/88] Use CDL to block threads to avoid flaky tests. (#14116) * Use CDL to block threads to avoid flaky tests. * Update CHANGES.txt --- lucene/CHANGES.txt | 3 +++ .../lucene/index/TestConcurrentMergeScheduler.java | 12 ++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index e6174479ee0a..c569db7f7ad0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -68,8 +68,11 @@ Bug Fixes Other --------------------- + * GITHUB#14081: Fix urls describing why NIOFS is not recommended for Windows (Marcel Yeonghyeon Ko) +* GITHUB#14116 Use CDL to block threads to avoid flaky tests. (Ao Li) + ======================= Lucene 10.1.0 ======================= API Changes diff --git a/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java b/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java index e0b2c49d8548..fcf42177570e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java @@ -418,7 +418,6 @@ protected void doMerge(MergeSource mergeSource, MergePolicy.OneMerge merge) dir.close(); } - @SuppressForbidden(reason = "Thread sleep") public void testIntraMergeThreadPoolIsLimitedByMaxThreads() throws IOException { ConcurrentMergeScheduler mergeScheduler = new ConcurrentMergeScheduler(); MergeScheduler.MergeSource mergeSource = @@ -475,11 +474,12 @@ public void merge(MergePolicy.OneMerge merge) throws IOException { Executor executor = mergeScheduler.intraMergeExecutor; AtomicInteger threadsExecutedOnPool = new AtomicInteger(); AtomicInteger threadsExecutedOnSelf = new AtomicInteger(); - for (int i = 0; i < 4; i++) { + CountDownLatch latch = new CountDownLatch(1); + final int totalThreads = 4; + for (int i = 0; i < totalThreads; i++) { mergeScheduler.mergeThreads.add( mergeScheduler.new MergeThread(mergeSource, merge) { @Override - @SuppressForbidden(reason = "Thread sleep") public void run() { executor.execute( () -> { @@ -489,7 +489,7 @@ public void run() { threadsExecutedOnPool.incrementAndGet(); } try { - Thread.sleep(100); + latch.await(); } catch (InterruptedException e) { throw new RuntimeException(e); } @@ -500,6 +500,10 @@ public void run() { for (ConcurrentMergeScheduler.MergeThread thread : mergeScheduler.mergeThreads) { thread.start(); } + while (threadsExecutedOnSelf.get() + threadsExecutedOnPool.get() < totalThreads) { + Thread.yield(); + } + latch.countDown(); mergeScheduler.sync(); assertEquals(3, threadsExecutedOnSelf.get()); assertEquals(1, threadsExecutedOnPool.get()); From 1778377176b901d70d994b71de30839acae224cb Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Thu, 9 Jan 2025 15:59:44 -0500 Subject: [PATCH 15/88] fix gh-14123: Add null checks to SortingCodecReader (#14125) --- lucene/CHANGES.txt | 2 + .../lucene/index/SortingCodecReader.java | 22 ++++++++++ .../lucene/index/TestSortingCodecReader.java | 40 +++++++++++++------ 3 files changed, 51 insertions(+), 13 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c569db7f7ad0..0e7c6b13ee5b 100644 --- a/lucene/CHANGES.txt 
+++ b/lucene/CHANGES.txt @@ -66,6 +66,8 @@ Bug Fixes * GITHUB#14109: prefetch may select the wrong memory segment for multi-segment slices. (Chris Hegarty) +* GITHUB#14123: SortingCodecReader NPE when segment has no (points, vectors, etc...) (Mike Sokolov) + Other --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index daec0c197d6a..ab9964026ad8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -314,6 +314,7 @@ private static class SortingFloatVectorValues extends FloatVectorValues { SortingFloatVectorValues(FloatVectorValues delegate, Sorter.DocMap sortMap) throws IOException { this.delegate = delegate; + assert delegate != null; // SortingValuesIterator consumes the iterator and records the docs and ord mapping iteratorSupplier = iteratorSupplier(delegate, sortMap); } @@ -446,6 +447,9 @@ private SortingCodecReader( @Override public FieldsProducer getPostingsReader() { FieldsProducer postingsReader = in.getPostingsReader(); + if (postingsReader == null) { + return null; + } return new FieldsProducer() { @Override public void close() throws IOException { @@ -481,6 +485,9 @@ public int size() { @Override public StoredFieldsReader getFieldsReader() { StoredFieldsReader delegate = in.getFieldsReader(); + if (delegate == null) { + return null; + } return newStoredFieldsReader(delegate); } @@ -526,6 +533,9 @@ public Bits getLiveDocs() { @Override public PointsReader getPointsReader() { final PointsReader delegate = in.getPointsReader(); + if (delegate == null) { + return null; + } return new PointsReader() { @Override public void checkIntegrity() throws IOException { @@ -551,6 +561,9 @@ public void close() throws IOException { @Override public KnnVectorsReader getVectorReader() { KnnVectorsReader delegate = in.getVectorReader(); + if (delegate == null) { + return null; + } return new KnnVectorsReader() { @Override public void checkIntegrity() throws IOException { @@ -587,6 +600,9 @@ public void close() throws IOException { @Override public NormsProducer getNormsReader() { final NormsProducer delegate = in.getNormsReader(); + if (delegate == null) { + return null; + } return new NormsProducer() { @Override public NumericDocValues getNorms(FieldInfo field) throws IOException { @@ -609,6 +625,9 @@ public void close() throws IOException { @Override public DocValuesProducer getDocValuesReader() { final DocValuesProducer delegate = in.getDocValuesReader(); + if (delegate == null) { + return null; + } return new DocValuesProducer() { @Override public NumericDocValues getNumeric(FieldInfo field) throws IOException { @@ -710,6 +729,9 @@ public TermVectorsReader getTermVectorsReader() { } private TermVectorsReader newTermVectorsReader(TermVectorsReader delegate) { + if (delegate == null) { + return null; + } return new TermVectorsReader() { @Override public void prefetch(int doc) throws IOException { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java index 8039d8b8f6fb..285296d55c19 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java @@ -25,6 +25,7 @@ import java.util.Collections; import java.util.List; import java.util.Locale; +import 
org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.hnsw.HnswGraphProvider; import org.apache.lucene.document.BinaryDocValuesField; @@ -153,14 +154,16 @@ public void testSortOnAddIndicesRandom() throws IOException { docIds.add(i); } Collections.shuffle(docIds, random()); - // If true, index a vector for every doc - boolean denseVectors = random().nextBoolean(); + // If true, index a vector and points for every doc + boolean dense = random().nextBoolean(); try (RandomIndexWriter iw = new RandomIndexWriter(random(), dir)) { for (int i = 0; i < numDocs; i++) { int docId = docIds.get(i); Document doc = new Document(); doc.add(new StringField("string_id", Integer.toString(docId), Field.Store.YES)); - doc.add(new LongPoint("point_id", docId)); + if (dense || docId % 3 == 0) { + doc.add(new LongPoint("point_id", docId)); + } String s = RandomStrings.randomRealisticUnicodeOfLength(random(), 25); doc.add(new TextField("text_field", s, Field.Store.YES)); doc.add(new BinaryDocValuesField("text_field", new BytesRef(s))); @@ -172,7 +175,7 @@ public void testSortOnAddIndicesRandom() throws IOException { doc.add(new BinaryDocValuesField("binary_dv", new BytesRef(Integer.toString(docId)))); doc.add( new SortedSetDocValuesField("sorted_set_dv", new BytesRef(Integer.toString(docId)))); - if (denseVectors || docId % 2 == 0) { + if (dense || docId % 2 == 0) { doc.add(new KnnFloatVectorField("vector", new float[] {(float) docId})); } doc.add(new NumericDocValuesField("foo", random().nextInt(20))); @@ -245,8 +248,13 @@ public void testSortOnAddIndicesRandom() throws IOException { SortedSetDocValues sorted_set_dv = leaf.getSortedSetDocValues("sorted_set_dv"); SortedDocValues binary_sorted_dv = leaf.getSortedDocValues("binary_sorted_dv"); FloatVectorValues vectorValues = leaf.getFloatVectorValues("vector"); - HnswGraph graph = - ((HnswGraphProvider) ((CodecReader) leaf).getVectorReader()).getGraph("vector"); + KnnVectorsReader vectorsReader = ((CodecReader) leaf).getVectorReader(); + HnswGraph graph; + if (vectorsReader instanceof HnswGraphProvider hnswGraphProvider) { + graph = hnswGraphProvider.getGraph("vector"); + } else { + graph = null; + } NumericDocValues ids = leaf.getNumericDocValues("id"); long prevValue = -1; boolean usingAltIds = false; @@ -272,10 +280,12 @@ public void testSortOnAddIndicesRandom() throws IOException { assertTrue(sorted_numeric_dv.advanceExact(idNext)); assertTrue(sorted_set_dv.advanceExact(idNext)); assertTrue(binary_sorted_dv.advanceExact(idNext)); - if (denseVectors || prevValue % 2 == 0) { + if (dense || prevValue % 2 == 0) { assertEquals(idNext, valuesIterator.advance(idNext)); - graph.seek(0, valuesIterator.index()); - assertNotEquals(DocIdSetIterator.NO_MORE_DOCS, graph.nextNeighbor()); + if (graph != null) { + graph.seek(0, valuesIterator.index()); + assertNotEquals(DocIdSetIterator.NO_MORE_DOCS, graph.nextNeighbor()); + } } assertEquals(new BytesRef(ids.longValue() + ""), binary_dv.binaryValue()); @@ -289,7 +299,7 @@ public void testSortOnAddIndicesRandom() throws IOException { assertEquals(1, sorted_numeric_dv.docValueCount()); assertEquals(ids.longValue(), sorted_numeric_dv.nextValue()); - if (denseVectors || prevValue % 2 == 0) { + if (dense || prevValue % 2 == 0) { float[] vectorValue = vectorValues.vectorValue(valuesIterator.index()); assertEquals(1, vectorValue.length); assertEquals((float) ids.longValue(), vectorValue[0], 0.001f); @@ -306,9 +316,13 @@ public void testSortOnAddIndicesRandom() 
throws IOException { leaf.storedFields().document(idNext).get("string_id")); IndexSearcher searcher = new IndexSearcher(r); TopDocs result = - searcher.search(LongPoint.newExactQuery("point_id", ids.longValue()), 1); - assertEquals(1, result.totalHits.value()); - assertEquals(idNext, result.scoreDocs[0].doc); + searcher.search(LongPoint.newExactQuery("point_id", ids.longValue()), 10); + if (dense || ids.longValue() % 3 == 0) { + assertEquals(1, result.totalHits.value()); + assertEquals(idNext, result.scoreDocs[0].doc); + } else { + assertEquals(0, result.totalHits.value()); + } result = searcher.search(new TermQuery(new Term("string_id", "" + ids.longValue())), 1); From 6f9702e6f9ca18fb258b451bd91ff28077d76a3c Mon Sep 17 00:00:00 2001 From: Ankit Jain Date: Fri, 10 Jan 2025 12:14:59 +0530 Subject: [PATCH 16/88] Removing unnecessary ByteArrayDataInput allocations by resetting inplace (#14113) Removing unnecessary ByteArrayDataInput allocations by resetting inplace Signed-off-by: Ankit Jain --- lucene/CHANGES.txt | 2 ++ .../codecs/lucene90/Lucene90DocValuesProducer.java | 9 ++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0e7c6b13ee5b..2486f9e7d390 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -52,6 +52,8 @@ Improvements (Robert Muir) * GITHUB#13984: Add HNSW graph checks and stats to CheckIndex +* GITHUB#14113: Remove unnecessary ByteArrayDataInput allocations from `Lucene90DocValuesProducer$TermsDict.decompressBlock`. (Ankit Jain) + Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java index 11e83b3f03c1..80dffb7b9708 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java @@ -1122,10 +1122,9 @@ private class TermsDict extends BaseTermsEnum { final LongValues indexAddresses; final RandomAccessInput indexBytes; final BytesRef term; + final BytesRef blockBuffer; + final ByteArrayDataInput blockInput; long ord = -1; - - BytesRef blockBuffer = null; - ByteArrayDataInput blockInput = null; long currentCompressedBlockStart = -1; long currentCompressedBlockEnd = -1; @@ -1149,6 +1148,7 @@ private class TermsDict extends BaseTermsEnum { // add 7 padding bytes can help decompression run faster. int bufferSize = entry.maxBlockLength + entry.maxTermLength + LZ4_DECOMPRESSOR_PADDING; blockBuffer = new BytesRef(new byte[bufferSize], 0, bufferSize); + blockInput = new ByteArrayDataInput(); } @Override @@ -1324,8 +1324,7 @@ private void decompressBlock() throws IOException { } // Reset the buffer. 
- blockInput = - new ByteArrayDataInput(blockBuffer.bytes, blockBuffer.offset, blockBuffer.length); + blockInput.reset(blockBuffer.bytes, blockBuffer.offset, blockBuffer.length); } } From c20e09e62f49943821396d729bbe0e9adc020121 Mon Sep 17 00:00:00 2001 From: Lu Xugang Date: Fri, 10 Jan 2025 16:13:17 +0800 Subject: [PATCH 17/88] Cover all DataType (#14091) Cover all DataType --- lucene/CHANGES.txt | 2 ++ .../src/test/org/apache/lucene/util/packed/TestPackedInts.java | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2486f9e7d390..4c1aac688497 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -77,6 +77,8 @@ Other * GITHUB#14116 Use CDL to block threads to avoid flaky tests. (Ao Li) +* GITHUB#14091: Cover all DataType. (Lu Xugang) + ======================= Lucene 10.1.0 ======================= API Changes diff --git a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java index e87f708c8d22..b114070ba9c0 100644 --- a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java +++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java @@ -986,7 +986,7 @@ public void testPackedLongValues() { new long[RandomNumbers.randomIntBetween(random(), 1, TEST_NIGHTLY ? 1000000 : 10000)]; float[] ratioOptions = new float[] {PackedInts.DEFAULT, PackedInts.COMPACT, PackedInts.FAST}; for (int bpv : new int[] {0, 1, 63, 64, RandomNumbers.randomIntBetween(random(), 2, 62)}) { - for (DataType dataType : Arrays.asList(DataType.DELTA_PACKED)) { + for (DataType dataType : DataType.values()) { final int pageSize = 1 << TestUtil.nextInt(random(), 6, 20); float acceptableOverheadRatio = ratioOptions[TestUtil.nextInt(random(), 0, ratioOptions.length - 1)]; From 9f0d3dd5c05bc37f71c499ab9c79c43cfd2b0bf4 Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Fri, 10 Jan 2025 19:39:41 +0530 Subject: [PATCH 18/88] Fixing precommit, ECJ, Rat, spotless, forbiddenApis etc. 
--- lucene/licenses/commons-LICENSE-ASL.txt | 202 ++++++++++++ lucene/licenses/commons-NOTICE.txt | 197 +++++++++++ lucene/licenses/commons-lang3-3.17.0.jar.sha1 | 1 + lucene/licenses/cuvs-java-25.02.jar.sha1 | 1 + lucene/licenses/cuvs-java-LICENSE-ASL.txt | 202 ++++++++++++ lucene/licenses/cuvs-java-NOTICE.txt | 197 +++++++++++ lucene/sandbox/src/java/module-info.java | 7 +- .../vectorsearch/CagraFieldVectorsWriter.java | 26 +- .../sandbox/vectorsearch/CuVSCodec.java | 34 +- .../sandbox/vectorsearch/CuVSIndex.java | 36 +- .../vectorsearch/CuVSKnnFloatVectorQuery.java | 29 +- .../sandbox/vectorsearch/CuVSSegmentFile.java | 31 +- .../vectorsearch/CuVSVectorsFormat.java | 38 ++- .../vectorsearch/CuVSVectorsReader.java | 268 +++++++++------ .../vectorsearch/CuVSVectorsWriter.java | 307 +++++++++++------- .../vectorsearch/PerLeafCuVSKnnCollector.java | 23 +- .../vectorsearch/SegmentInputStream.java | 28 +- .../lucene/sandbox/vectorsearch/Util.java | 114 ++----- .../sandbox/vectorsearch/package-info.java | 16 + .../services/org.apache.lucene.codecs.Codec | 15 + .../lucene/sandbox/vectorsearch/TestCuVS.java | 95 +++--- versions.lock | 68 ++++ versions.toml | 2 - 23 files changed, 1525 insertions(+), 412 deletions(-) create mode 100644 lucene/licenses/commons-LICENSE-ASL.txt create mode 100644 lucene/licenses/commons-NOTICE.txt create mode 100644 lucene/licenses/commons-lang3-3.17.0.jar.sha1 create mode 100644 lucene/licenses/cuvs-java-25.02.jar.sha1 create mode 100644 lucene/licenses/cuvs-java-LICENSE-ASL.txt create mode 100644 lucene/licenses/cuvs-java-NOTICE.txt diff --git a/lucene/licenses/commons-LICENSE-ASL.txt b/lucene/licenses/commons-LICENSE-ASL.txt new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/lucene/licenses/commons-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/commons-NOTICE.txt b/lucene/licenses/commons-NOTICE.txt new file mode 100644 index 000000000000..554991d39bcf --- /dev/null +++ b/lucene/licenses/commons-NOTICE.txt @@ -0,0 +1,197 @@ +Apache Lucene +Copyright 2001-2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. 
See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4 +compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed +under the 2-clause BSD license. +(https://opensource.org/licenses/bsd-license.php) + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + https://snowballstem.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. 
These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See https://github.com/carrot2/carrot2. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski +(https://github.com/morfologik/morfologik-stemming) and uses +data from the BSD-licensed dictionary of Polish (SGJP, http://sgjp.pl/morfeusz/). + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. + +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. 
+ +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. + +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. 
+ +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.1.1-20180720 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz diff --git a/lucene/licenses/commons-lang3-3.17.0.jar.sha1 b/lucene/licenses/commons-lang3-3.17.0.jar.sha1 new file mode 100644 index 000000000000..f64174593b1c --- /dev/null +++ b/lucene/licenses/commons-lang3-3.17.0.jar.sha1 @@ -0,0 +1 @@ +b17d2136f0460dcc0d2016ceefca8723bdf4ee70 diff --git a/lucene/licenses/cuvs-java-25.02.jar.sha1 b/lucene/licenses/cuvs-java-25.02.jar.sha1 new file mode 100644 index 000000000000..e399aed842a5 --- /dev/null +++ b/lucene/licenses/cuvs-java-25.02.jar.sha1 @@ -0,0 +1 @@ +280c6f97d99a8d32500a0c0891db1ccdc49bc17b diff --git a/lucene/licenses/cuvs-java-LICENSE-ASL.txt b/lucene/licenses/cuvs-java-LICENSE-ASL.txt new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/lucene/licenses/cuvs-java-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/cuvs-java-NOTICE.txt b/lucene/licenses/cuvs-java-NOTICE.txt new file mode 100644 index 000000000000..554991d39bcf --- /dev/null +++ b/lucene/licenses/cuvs-java-NOTICE.txt @@ -0,0 +1,197 @@ +Apache Lucene +Copyright 2001-2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. 
See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4 +compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed +under the 2-clause BSD license. +(https://opensource.org/licenses/bsd-license.php) + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + https://snowballstem.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. 
These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See https://github.com/carrot2/carrot2. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski +(https://github.com/morfologik/morfologik-stemming) and uses +data from the BSD-licensed dictionary of Polish (SGJP, http://sgjp.pl/morfeusz/). + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. + +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. 
+ +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. + +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. 
+ +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.1.1-20180720 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index b2d45adf4d30..051c1df0a257 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -23,7 +23,7 @@ requires java.logging; requires com.nvidia.cuvs; requires org.apache.commons.lang3; - + exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; exports org.apache.lucene.sandbox.codecs.quantization; @@ -37,7 +37,12 @@ exports org.apache.lucene.sandbox.facet.iterators; exports org.apache.lucene.sandbox.facet.cutters; exports org.apache.lucene.sandbox.facet.labels; + exports org.apache.lucene.sandbox.vectorsearch; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; + // provides org.apache.lucene.codecs.KnnVectorsFormat with + // org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat; + provides org.apache.lucene.codecs.Codec with + org.apache.lucene.sandbox.vectorsearch.CuVSCodec; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java index 21c088bd84f8..df8f83966dc3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -1,15 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.vectorsearch; import java.io.IOException; +import java.nio.charset.Charset; import java.util.concurrent.ConcurrentHashMap; - import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.index.FieldInfo; public class CagraFieldVectorsWriter extends KnnFieldVectorsWriter<float[]> { public final String fieldName; - public final ConcurrentHashMap<Integer, float[]> vectors = new ConcurrentHashMap<Integer, float[]>(); + public final ConcurrentHashMap<Integer, float[]> vectors = + new ConcurrentHashMap<Integer, float[]>(); public int fieldVectorDimension = -1; public CagraFieldVectorsWriter(FieldInfo fieldInfo) { @@ -19,7 +36,9 @@ public CagraFieldVectorsWriter(FieldInfo fieldInfo) { @Override public long ramBytesUsed() { - return fieldName.getBytes().length + Integer.BYTES + (vectors.size() * fieldVectorDimension * Float.BYTES); + return fieldName.getBytes(Charset.forName("UTF-8")).length + + Integer.BYTES + + (vectors.size() * fieldVectorDimension * Float.BYTES); } @Override @@ -31,5 +50,4 @@ public void addValue(int docID, float[] vectorValue) throws IOException { public float[] copyValue(float[] vectorValue) { throw new UnsupportedOperationException(); } - } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java index 448803bb7fc4..315923d1eeb2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -1,12 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.sandbox.vectorsearch; +import com.nvidia.cuvs.LibraryNotFoundException; +import java.util.logging.Logger; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; - public class CuVSCodec extends FilterCodec { public CuVSCodec() { @@ -15,17 +32,24 @@ public CuVSCodec() { public CuVSCodec(String name, Codec delegate) { super(name, delegate); - setKnnFormat(new CuVSVectorsFormat(1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE)); + KnnVectorsFormat format; + try { + format = new CuVSVectorsFormat(1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE); + setKnnFormat(format); + } catch (LibraryNotFoundException ex) { + Logger log = Logger.getLogger(CuVSCodec.class.getName()); + log.severe("Couldn't load native library, possible classloader issue. " + ex.getMessage()); + } } - + KnnVectorsFormat knnFormat = null; @Override public KnnVectorsFormat knnVectorsFormat() { return knnFormat; } - + public void setKnnFormat(KnnVectorsFormat format) { this.knnFormat = format; } -} \ No newline at end of file +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index 1878b6c236bc..98a2eb9739ac 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -1,10 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.sandbox.vectorsearch; -import java.util.List; -import java.util.Objects; - import com.nvidia.cuvs.BruteForceIndex; import com.nvidia.cuvs.CagraIndex; +import java.util.List; +import java.util.Objects; public class CuVSIndex { private final CagraIndex cagraIndex; @@ -12,11 +27,18 @@ public class CuVSIndex { private final List<Integer> mapping; private final List<float[]> vectors; private final int maxDocs; - + private final String fieldName; private final String segmentName; - public CuVSIndex(String segmentName, String fieldName, CagraIndex cagraIndex, List<Integer> mapping, List<float[]> vectors, int maxDocs, BruteForceIndex bruteforceIndex) { + public CuVSIndex( + String segmentName, + String fieldName, + CagraIndex cagraIndex, + List<Integer> mapping, + List<float[]> vectors, + int maxDocs, + BruteForceIndex bruteforceIndex) { this.cagraIndex = Objects.requireNonNull(cagraIndex); this.bruteforceIndex = Objects.requireNonNull(bruteforceIndex); this.mapping = Objects.requireNonNull(mapping); @@ -25,7 +47,7 @@ public CuVSIndex(String segmentName, String fieldName, CagraIndex cagraIndex, Li this.segmentName = Objects.requireNonNull(segmentName); this.maxDocs = Objects.requireNonNull(maxDocs); } - + public CagraIndex getCagraIndex() { return cagraIndex; } @@ -53,4 +75,4 @@ public String getSegmentName() { public int getMaxDocs() { return maxDocs; } -} \ No newline at end of file +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java index 1bbae88c5630..e4df14208f97 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -1,7 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.sandbox.vectorsearch; import java.io.IOException; - import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.KnnFloatVectorQuery; @@ -11,8 +26,8 @@ public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { - final private int iTopK; - final private int searchWidth; + private final int iTopK; + private final int searchWidth; public CuVSKnnFloatVectorQuery(String field, float[] target, int k, int iTopK, int searchWidth) { super(field, target, k); @@ -21,7 +36,12 @@ public CuVSKnnFloatVectorQuery(String field, float[] target, int k, int iTopK, i } @Override - protected TopDocs approximateSearch(LeafReaderContext context, Bits acceptDocs, int visitedLimit, KnnCollectorManager knnCollectorManager) throws IOException { + protected TopDocs approximateSearch( + LeafReaderContext context, + Bits acceptDocs, + int visitedLimit, + KnnCollectorManager knnCollectorManager) + throws IOException { PerLeafCuVSKnnCollector results = new PerLeafCuVSKnnCollector(k, iTopK, searchWidth); @@ -29,5 +49,4 @@ protected TopDocs approximateSearch(LeafReaderContext context, Bits acceptDocs, reader.searchNearestVectors(field, this.getTargetCopy(), results, null); return results.topDocs(); } - } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java index 9ca0d63ba087..7b850daa6662 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java @@ -1,6 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.vectorsearch; -import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.util.Collections; @@ -11,8 +26,8 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; -public class CuVSSegmentFile implements AutoCloseable{ - final private ZipOutputStream zos; +public class CuVSSegmentFile implements AutoCloseable { + private final ZipOutputStream zos; private Set<String> filesAdded = new HashSet<String>(); @@ -20,18 +35,22 @@ public CuVSSegmentFile(OutputStream out) { zos = new ZipOutputStream(out); zos.setLevel(Deflater.NO_COMPRESSION); } - + protected Logger log = Logger.getLogger(getClass().getName()); public void addFile(String name, byte[] bytes) throws IOException { - log.info("Writing the file: " + name + ", size="+bytes.length + ", space remaining: "+new File("/").getFreeSpace()); + /*log.info( + "Writing the file: " + + name + + ", size=" + + bytes.length);*/ ZipEntry indexFileZipEntry = new ZipEntry(name); zos.putNextEntry(indexFileZipEntry); zos.write(bytes, 0, bytes.length); zos.closeEntry(); filesAdded.add(name); } - + public Set<String> getFilesAdded() { return Collections.unmodifiableSet(filesAdded); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index c17b5258c9d5..e2b5bc2169f5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -1,14 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ package org.apache.lucene.sandbox.vectorsearch; +import com.nvidia.cuvs.CuVSResources; +import com.nvidia.cuvs.LibraryNotFoundException; import java.io.IOException; - import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; -import com.nvidia.cuvs.CuVSResources; - public class CuVSVectorsFormat extends KnnVectorsFormat { public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; @@ -30,11 +45,13 @@ public CuVSVectorsFormat() { try { resources = new CuVSResources(); } catch (Throwable e) { - e.printStackTrace(); + throw new RuntimeException(e); } } - public CuVSVectorsFormat(int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy) { + public CuVSVectorsFormat( + int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy) + throws LibraryNotFoundException { super("CuVSVectorsFormat"); this.mergeStrategy = mergeStrategy; this.cuvsWriterThreads = cuvsWriterThreads; @@ -42,14 +59,17 @@ public CuVSVectorsFormat(int cuvsWriterThreads, int intGraphDegree, int graphDeg this.graphDegree = graphDegree; try { resources = new CuVSResources(); + } catch (LibraryNotFoundException ex) { + throw ex; } catch (Throwable e) { - e.printStackTrace(); + throw new RuntimeException(e); } } @Override public CuVSVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { - return new CuVSVectorsWriter(state, cuvsWriterThreads, intGraphDegree, graphDegree, mergeStrategy, resources); + return new CuVSVectorsWriter( + state, cuvsWriterThreads, intGraphDegree, graphDegree, mergeStrategy, resources); } @Override @@ -57,14 +77,12 @@ public CuVSVectorsReader fieldsReader(SegmentReadState state) throws IOException try { return new CuVSVectorsReader(state, resources); } catch (Throwable e) { - e.printStackTrace(); + throw new RuntimeException(e); } - return null; } @Override public int getMaxDimensions(String fieldName) { return maxDimensions; } - } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index 837a9229d061..d7e8a5f19b08 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -1,28 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.vectorsearch; +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.BruteForceQuery; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CagraQuery; +import com.nvidia.cuvs.CagraSearchParams; +import com.nvidia.cuvs.CuVSResources; +import com.nvidia.cuvs.HnswIndex; +import com.nvidia.cuvs.HnswIndexParams; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.lang.StackWalker.StackFrame; -import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.logging.Logger; import java.util.stream.Collectors; import java.util.stream.Stream; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; - import org.apache.commons.lang3.SerializationUtils; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.TopKnnCollector; @@ -31,18 +51,9 @@ import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; -import com.nvidia.cuvs.BruteForceIndex; -import com.nvidia.cuvs.BruteForceQuery; -import com.nvidia.cuvs.CagraIndex; -import com.nvidia.cuvs.CagraQuery; -import com.nvidia.cuvs.CagraSearchParams; -import com.nvidia.cuvs.CuVSResources; -import com.nvidia.cuvs.HnswIndex; -import com.nvidia.cuvs.HnswIndexParams; - public class CuVSVectorsReader extends KnnVectorsReader { - protected Logger log = Logger.getLogger(getClass().getName()); + // protected Logger log = Logger.getLogger(getClass().getName()); IndexInput vectorDataReader = null; public String fileName = null; @@ -53,7 +64,7 @@ public class CuVSVectorsReader extends KnnVectorsReader { public int indexFilePayloadSize = 0; public long initialFilePointerLoc = 0; public SegmentInputStream segmentInputStream; - + // Field to List of Indexes public Map<String, List<CuVSIndex>> cuvsIndexes; @@ -64,17 +75,21 @@ public CuVSVectorsReader(SegmentReadState state, CuVSResources resources) throws segmentState = state; this.resources = resources; - fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, - CuVSVectorsFormat.VECTOR_DATA_EXTENSION); + fileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, CuVSVectorsFormat.VECTOR_DATA_EXTENSION); vectorDataReader = segmentState.directory.openInput(fileName, segmentState.context); CodecUtil.readIndexHeader(vectorDataReader); initialFilePointerLoc = vectorDataReader.getFilePointer(); - indexFilePayloadSize = (int)vectorDataReader.length() - (int)initialFilePointerLoc; //vectorMetaReader.readInt(); - segmentInputStream = new SegmentInputStream(vectorDataReader, indexFilePayloadSize, initialFilePointerLoc); - log.info("payloadSize: " + indexFilePayloadSize); - log.info("initialFilePointerLoc: " + initialFilePointerLoc); + indexFilePayloadSize = + (int) vectorDataReader.length() + - (int) initialFilePointerLoc; // vectorMetaReader.readInt(); + segmentInputStream = + new SegmentInputStream(vectorDataReader, indexFilePayloadSize, initialFilePointerLoc); + // log.info("payloadSize: " + indexFilePayloadSize); + // log.info("initialFilePointerLoc: " + initialFilePointerLoc); List<StackFrame> stackTrace = StackWalker.getInstance().walk(this::getStackTrace); @@ -82,36 +97,39 @@ public CuVSVectorsReader(SegmentReadState state, CuVSResources resources) throws for (StackFrame s : stackTrace) { if (s.toString().startsWith("org.apache.lucene.index.IndexWriter.merge")) { isMergeCase = true; - log.info("Reader opening on merge call"); + // log.info("Reader opening on merge call"); break; } } - - log.info("Source of this segment "+segmentState.segmentSuffix+" is " + segmentState.segmentInfo.getDiagnostics().get(IndexWriter.SOURCE)); + + /*log.info( + "Source of this segment " + + segmentState.segmentSuffix + + " is " + + segmentState.segmentInfo.getDiagnostics().get(IndexWriter.SOURCE)); log.info("Loading for " + segmentState.segmentInfo.name + ", mergeCase? " + isMergeCase); - //if (!isMergeCase) { nocommit: TODO: don't load the cagra index for merge case. - log.info("Not the merge case, hence loading for " + segmentState.segmentInfo.name); - this.cuvsIndexes = loadCuVSIndex(getIndexInputStream(), isMergeCase); - //} + log.info("Not the merge case, hence loading for " + segmentState.segmentInfo.name);*/ + this.cuvsIndexes = loadCuVSIndex(getIndexInputStream(), isMergeCase); } - + @SuppressWarnings({"unchecked"}) - private Map<String, List<CuVSIndex>> loadCuVSIndex(ZipInputStream zis, boolean isMergeCase) throws Throwable { + private Map<String, List<CuVSIndex>> loadCuVSIndex(ZipInputStream zis, boolean isMergeCase) + throws Throwable { Map<String, List<CuVSIndex>> ret = new HashMap<String, List<CuVSIndex>>(); Map<String, CagraIndex> cagraIndexes = new HashMap<String, CagraIndex>(); Map<String, BruteForceIndex> bruteforceIndexes = new HashMap<String, BruteForceIndex>(); Map<String, HnswIndex> hnswIndexes = new HashMap<String, HnswIndex>(); Map<String, List<Integer>> mappings = new HashMap<String, List<Integer>>(); Map<String, List<float[]>> vectors = new HashMap<String, List<float[]>>(); - + Map<String, Integer> maxDocs = null; // map of segment, maxDocs ZipEntry ze; while ((ze = zis.getNextEntry()) != null) { String entry = ze.getName(); - + String segmentField = entry.split("\\.")[0]; String extension = entry.split("\\.")[1]; - + ByteArrayOutputStream baos = new ByteArrayOutputStream(); byte[] buffer = new byte[1024]; int len = 0; @@ -120,55 +138,76 @@ private Map> loadCuVSIndex(ZipInputStream zis, boolean i } switch (extension) { - case "meta": { - maxDocs = (Map<String, Integer>) SerializationUtils.deserialize(baos.toByteArray()); // nocommit use IOUtils - break; - } - case "vec": { - vectors.put(segmentField, (List<float[]>) SerializationUtils.deserialize(baos.toByteArray())); // nocommit use IOUtils - break; - } - case "map": { - List<Integer> map = (List<Integer>) SerializationUtils.deserialize(baos.toByteArray()); // nocommit use IOUtils - mappings.put(segmentField, map); - break; - } - case "cag": { - cagraIndexes.put(segmentField, new CagraIndex.Builder(resources) - .from(new ByteArrayInputStream(baos.toByteArray())) - .build()); - break; - } - case "bf": { - bruteforceIndexes.put(segmentField, new BruteForceIndex.Builder(resources) - .from(new ByteArrayInputStream(baos.toByteArray())) - .build()); - break; - } - case "hnsw": { - HnswIndexParams indexParams = new HnswIndexParams.Builder(resources) - .build(); - hnswIndexes.put(segmentField, new HnswIndex.Builder(resources) - .from(new ByteArrayInputStream(baos.toByteArray())) - .withIndexParams(indexParams) - .build()); - break; - } + case "meta": + { + maxDocs = (Map<String, Integer>) SerializationUtils.deserialize(baos.toByteArray()); + break; + } + case "vec": + { + vectors.put( + segmentField, (List<float[]>) SerializationUtils.deserialize(baos.toByteArray())); + break; + } + case "map": + { + List<Integer> map = (List<Integer>) SerializationUtils.deserialize(baos.toByteArray()); + mappings.put(segmentField, map); + break; + } + case "cag": + { + cagraIndexes.put( + segmentField, + new CagraIndex.Builder(resources) + .from(new ByteArrayInputStream(baos.toByteArray())) + .build()); + break; + } + case "bf": + { + bruteforceIndexes.put( + segmentField, + new BruteForceIndex.Builder(resources) + .from(new ByteArrayInputStream(baos.toByteArray())) + .build()); + break; + } + case "hnsw": + { + HnswIndexParams indexParams = new HnswIndexParams.Builder(resources).build(); + hnswIndexes.put( + segmentField, + new HnswIndex.Builder(resources) + .from(new ByteArrayInputStream(baos.toByteArray())) + .withIndexParams(indexParams) + .build()); + break; + } } } - log.info("Loading cuvsIndexes from segment: " + segmentState.segmentInfo.name); + /*log.info("Loading cuvsIndexes from segment: " + segmentState.segmentInfo.name); log.info("Diagnostics for this segment: " + segmentState.segmentInfo.getDiagnostics()); log.info("Loading map of cagraIndexes: " + cagraIndexes); log.info("Loading vectors: " + vectors); - log.info("Loading mapping: " + mappings); + log.info("Loading mapping: " + mappings);*/ - for (String segmentField: cagraIndexes.keySet()) { - log.info("Loading segmentField: " + segmentField); + for (String segmentField : cagraIndexes.keySet()) { + // log.info("Loading segmentField: " + segmentField); String segment = segmentField.split("/")[0]; String field = segmentField.split("/")[1]; - CuVSIndex cuvsIndex = new CuVSIndex(segment, field, cagraIndexes.get(segmentField), mappings.get(segmentField), vectors.get(segmentField), maxDocs.get(segment), bruteforceIndexes.get(segmentField)); - List<CuVSIndex> listOfIndexes = ret.containsKey(field)? ret.get(field): new ArrayList<CuVSIndex>(); + CuVSIndex cuvsIndex = + new CuVSIndex( + segment, + field, + cagraIndexes.get(segmentField), + mappings.get(segmentField), + vectors.get(segmentField), + maxDocs.get(segment), + bruteforceIndexes.get(segmentField)); + List<CuVSIndex> listOfIndexes = + ret.containsKey(field) ? ret.get(field) : new ArrayList<CuVSIndex>(); listOfIndexes.add(cuvsIndex); ret.put(field, listOfIndexes); } @@ -197,22 +236,22 @@ public void checkIntegrity() throws IOException { @Override public FloatVectorValues getFloatVectorValues(String field) throws IOException { return new FloatVectorValues() { - + @Override public int size() { return cuvsIndexes.get(field).get(0).getVectors().size(); } - + @Override public int dimension() { return cuvsIndexes.get(field).get(0).getVectors().get(0).length; } - + @Override public float[] vectorValue(int pos) throws IOException { return cuvsIndexes.get(field).get(0).getVectors().get(pos); } - + @Override public FloatVectorValues copy() throws IOException { return null; @@ -226,46 +265,60 @@ public ByteVectorValues getByteVectorValues(String field) throws IOException { } @Override - public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - PerLeafCuVSKnnCollector cuvsCollector = knnCollector instanceof PerLeafCuVSKnnCollector? ((PerLeafCuVSKnnCollector)knnCollector): new PerLeafCuVSKnnCollector(knnCollector.k(), knnCollector.k(), 1); - TopKnnCollector defaultCollector = knnCollector instanceof TopKnnCollector? ((TopKnnCollector)knnCollector): null; + public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) + throws IOException { + PerLeafCuVSKnnCollector cuvsCollector = + knnCollector instanceof PerLeafCuVSKnnCollector + ? ((PerLeafCuVSKnnCollector) knnCollector) + : new PerLeafCuVSKnnCollector(knnCollector.k(), knnCollector.k(), 1); + TopKnnCollector defaultCollector = + knnCollector instanceof TopKnnCollector ?
((TopKnnCollector) knnCollector) : null; int prevDocCount = 0; - // log.debug("Will try to search all the indexes for segment "+segmentState.segmentInfo.name+", field "+field+": "+cuvsIndexes); - for (CuVSIndex cuvsIndex: cuvsIndexes.get(field)) { + // log.debug("Will try to search all the indexes for segment "+segmentState.segmentInfo.name+", + // field "+field+": "+cuvsIndexes); + for (CuVSIndex cuvsIndex : cuvsIndexes.get(field)) { try { Map result = new HashMap(); if (cuvsCollector.k() <= 1024) { - CagraSearchParams searchParams = new CagraSearchParams.Builder(resources) - .withItopkSize(cuvsCollector.iTopK) - .withSearchWidth(cuvsCollector.searchWidth) - .build(); - - CagraQuery query = new CagraQuery.Builder() - .withTopK(cuvsCollector.k()) - .withSearchParams(searchParams) - .withMapping(cuvsIndex.getMapping()) - .withQueryVectors(new float[][] {target}) - .build(); - + CagraSearchParams searchParams = + new CagraSearchParams.Builder(resources) + .withItopkSize(cuvsCollector.iTopK) + .withSearchWidth(cuvsCollector.searchWidth) + .build(); + + CagraQuery query = + new CagraQuery.Builder() + .withTopK(cuvsCollector.k()) + .withSearchParams(searchParams) + .withMapping(cuvsIndex.getMapping()) + .withQueryVectors(new float[][] {target}) + .build(); + CagraIndex cagraIndex = cuvsIndex.getCagraIndex(); assert (cagraIndex != null); - log.info("k is " + cuvsCollector.k()); - result = cagraIndex.search(query).getResults().get(0); // List expected to have only one entry because of single query "target". - log.info("INTERMEDIATE RESULT FROM CUVS: " + result + ", prevDocCount=" + prevDocCount); + // log.info("k is " + cuvsCollector.k()); + result = + cagraIndex + .search(query) + .getResults() + .get(0); // List expected to have only one entry because of single query "target". 
+ // log.info("INTERMEDIATE RESULT FROM CUVS: " + result + ", prevDocCount=" + + // prevDocCount); } else { - BruteForceQuery bruteforceQuery = new BruteForceQuery.Builder() - .withQueryVectors(new float[][] { target }) - .withPrefilter(((FixedBitSet)acceptDocs).getBits()) - .withTopK(cuvsCollector.k()) - .build(); + BruteForceQuery bruteforceQuery = + new BruteForceQuery.Builder() + .withQueryVectors(new float[][] {target}) + .withPrefilter(((FixedBitSet) acceptDocs).getBits()) + .withTopK(cuvsCollector.k()) + .build(); BruteForceIndex bruteforceIndex = cuvsIndex.getBruteforceIndex(); result = bruteforceIndex.search(bruteforceQuery).getResults().get(0); } - - for(Entry kv : result.entrySet()) { + + for (Entry kv : result.entrySet()) { if (defaultCollector != null) { defaultCollector.collect(prevDocCount + kv.getKey(), kv.getValue()); } @@ -273,14 +326,15 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits } } catch (Throwable e) { - e.printStackTrace(); + throw new RuntimeException(e); } prevDocCount += cuvsIndex.getMaxDocs(); } } - + @Override - public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { + public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) + throws IOException { throw new UnsupportedOperationException(); } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 1da7ca0f9e6c..d5c155ca7212 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -1,16 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.vectorsearch; +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.BruteForceIndexParams; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CagraIndexParams; +import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo; +import com.nvidia.cuvs.CuVSResources; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.OutputStream; -import java.lang.invoke.MethodHandles; import java.util.ArrayList; -import java.util.Arrays; import java.util.LinkedHashMap; import java.util.List; -import java.util.logging.Logger; - import org.apache.commons.lang3.SerializationUtils; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnFieldVectorsWriter; @@ -22,17 +40,11 @@ import org.apache.lucene.index.Sorter.DocMap; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; - -import com.nvidia.cuvs.BruteForceIndex; -import com.nvidia.cuvs.BruteForceIndexParams; -import com.nvidia.cuvs.CagraIndex; -import com.nvidia.cuvs.CagraIndexParams; -import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo; -import com.nvidia.cuvs.CuVSResources; +import org.apache.lucene.util.SuppressForbidden; public class CuVSVectorsWriter extends KnnVectorsWriter { - protected Logger log = Logger.getLogger(getClass().getName()); + // protected Logger log = Logger.getLogger(getClass().getName()); private List fieldVectorWriters = new ArrayList<>(); private IndexOutput cuVSIndex = null; @@ -41,7 +53,7 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { private CagraIndex cagraIndex; private CagraIndex cagraIndexForHnsw; - + private int cuvsWriterThreads; private int intGraphDegree; private int graphDegree; @@ -49,10 +61,17 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { private CuVSResources resources; public enum MergeStrategy { - TRIVIAL_MERGE, NON_TRIVIAL_MERGE + TRIVIAL_MERGE, + NON_TRIVIAL_MERGE }; - public CuVSVectorsWriter(SegmentWriteState state, int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy, CuVSResources resources) + public CuVSVectorsWriter( + SegmentWriteState state, + int cuvsWriterThreads, + int intGraphDegree, + int graphDegree, + MergeStrategy mergeStrategy, + CuVSResources resources) throws IOException { super(); this.segmentWriteState = state; @@ -62,7 +81,11 @@ public CuVSVectorsWriter(SegmentWriteState state, int cuvsWriterThreads, int int this.graphDegree = graphDegree; this.resources = resources; - cuVSDataFilename = IndexFileNames.segmentFileName(this.segmentWriteState.segmentInfo.name, this.segmentWriteState.segmentSuffix, CuVSVectorsFormat.VECTOR_DATA_EXTENSION); + cuVSDataFilename = + IndexFileNames.segmentFileName( + this.segmentWriteState.segmentInfo.name, + this.segmentWriteState.segmentSuffix, + CuVSVectorsFormat.VECTOR_DATA_EXTENSION); } @Override @@ -85,58 +108,65 @@ public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException return cagraFieldVectorWriter; } + @SuppressForbidden(reason = "A temporary java.util.File is needed for Cagra's serialization") private byte[] createCagraIndex(float[][] vectors, List mapping) throws Throwable { - CagraIndexParams indexParams = new CagraIndexParams.Builder(resources) - .withNumWriterThreads(cuvsWriterThreads) - .withIntermediateGraphDegree(intGraphDegree) - .withGraphDegree(graphDegree) - .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) - .build(); - - log.info("Indexing started: " + System.currentTimeMillis()); - 
cagraIndex = new CagraIndex.Builder(resources) - .withDataset(vectors) - .withIndexParams(indexParams) - .build(); - log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + vectors.length); + CagraIndexParams indexParams = + new CagraIndexParams.Builder(resources) + .withNumWriterThreads(cuvsWriterThreads) + .withIntermediateGraphDegree(intGraphDegree) + .withGraphDegree(graphDegree) + .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) + .build(); + + // log.info("Indexing started: " + System.currentTimeMillis()); + cagraIndex = + new CagraIndex.Builder(resources).withDataset(vectors).withIndexParams(indexParams).build(); + // log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + + // vectors.length); ByteArrayOutputStream baos = new ByteArrayOutputStream(); - File tmpFile = File.createTempFile("tmpindex", "cag"); // TODO: Should we make this a file with random names? + File tmpFile = + File.createTempFile( + "tmpindex", "cag"); // TODO: Should we make this a file with random names? cagraIndex.serialize(baos, tmpFile); return baos.toByteArray(); } + @SuppressForbidden(reason = "A temporary java.util.File is needed for BruteForce's serialization") private byte[] createBruteForceIndex(float[][] vectors) throws Throwable { - BruteForceIndexParams indexParams = new BruteForceIndexParams.Builder() - .withNumWriterThreads(32) // TODO: Make this configurable later. - .build(); - - log.info("Indexing started: " + System.currentTimeMillis()); - BruteForceIndex index = new BruteForceIndex.Builder(resources) - .withIndexParams(indexParams) - .withDataset(vectors) - .build(); - - log.info("Indexing done: " + System.currentTimeMillis()); + BruteForceIndexParams indexParams = + new BruteForceIndexParams.Builder() + .withNumWriterThreads(32) // TODO: Make this configurable later. 
+ .build(); + + // log.info("Indexing started: " + System.currentTimeMillis()); + BruteForceIndex index = + new BruteForceIndex.Builder(resources) + .withIndexParams(indexParams) + .withDataset(vectors) + .build(); + + // log.info("Indexing done: " + System.currentTimeMillis()); ByteArrayOutputStream baos = new ByteArrayOutputStream(); index.serialize(baos); return baos.toByteArray(); } - + + @SuppressForbidden(reason = "A temporary java.util.File is needed for HNSW's serialization") private byte[] createHnswIndex(float[][] vectors) throws Throwable { - CagraIndexParams indexParams = new CagraIndexParams.Builder(resources) - .withNumWriterThreads(cuvsWriterThreads) - .withIntermediateGraphDegree(intGraphDegree) - .withGraphDegree(graphDegree) - .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) - .build(); - - log.info("Indexing started: " + System.currentTimeMillis()); - cagraIndexForHnsw = new CagraIndex.Builder(resources) - .withDataset(vectors) - .withIndexParams(indexParams) - .build(); - log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + vectors.length); + CagraIndexParams indexParams = + new CagraIndexParams.Builder(resources) + .withNumWriterThreads(cuvsWriterThreads) + .withIntermediateGraphDegree(intGraphDegree) + .withGraphDegree(graphDegree) + .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) + .build(); + + // log.info("Indexing started: " + System.currentTimeMillis()); + cagraIndexForHnsw = + new CagraIndex.Builder(resources).withDataset(vectors).withIndexParams(indexParams).build(); + // log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + + // vectors.length); ByteArrayOutputStream baos = new ByteArrayOutputStream(); File tmpFile = File.createTempFile("tmpindex", "hnsw"); @@ -147,64 +177,82 @@ private byte[] createHnswIndex(float[][] vectors) throws Throwable { @SuppressWarnings({"resource", "rawtypes", "unchecked"}) @Override public void flush(int maxDoc, DocMap sortMap) throws IOException { - cuVSIndex = this.segmentWriteState.directory.createOutput(cuVSDataFilename, this.segmentWriteState.context); - CodecUtil.writeIndexHeader(cuVSIndex, CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, CuVSVectorsFormat.VERSION_CURRENT, this.segmentWriteState.segmentInfo.getId(), this.segmentWriteState.segmentSuffix); - + cuVSIndex = + this.segmentWriteState.directory.createOutput( + cuVSDataFilename, this.segmentWriteState.context); + CodecUtil.writeIndexHeader( + cuVSIndex, + CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, + CuVSVectorsFormat.VERSION_CURRENT, + this.segmentWriteState.segmentInfo.getId(), + this.segmentWriteState.segmentSuffix); CuVSSegmentFile cuVSFile = new CuVSSegmentFile(new SegmentOutputStream(cuVSIndex, 100000)); LinkedHashMap metaMap = new LinkedHashMap(); for (CagraFieldVectorsWriter field : fieldVectorWriters) { - long start = System.currentTimeMillis(); + // long start = System.currentTimeMillis(); byte[] cagraIndexBytes = null; byte[] bruteForceIndexBytes = null; byte[] hnswIndexBytes = null; try { - log.info("Starting CAGRA indexing, space remaining: "+new File("/").getFreeSpace()); - log.info("Starting CAGRA indexing, docs: " + field.vectors.size()); - + // log.info("Starting CAGRA indexing, space remaining: " + new File("/").getFreeSpace()); + // log.info("Starting CAGRA indexing, docs: " + field.vectors.size()); + float vectors[][] = new float[field.vectors.size()][field.vectors.get(0).length]; for (int i = 0; i < vectors.length; i++) { for (int j = 0; j < vectors[i].length; j++) { vectors[i][j] = 
field.vectors.get(i)[j]; } } - - cagraIndexBytes = createCagraIndex(vectors, new ArrayList(field.vectors.keySet())); // nocommit + + cagraIndexBytes = createCagraIndex(vectors, new ArrayList(field.vectors.keySet())); bruteForceIndexBytes = createBruteForceIndex(vectors); hnswIndexBytes = createHnswIndex(vectors); } catch (Throwable e) { - e.printStackTrace(); + throw new RuntimeException(e); } - - start = System.currentTimeMillis(); - cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".cag", cagraIndexBytes); - log.info("time for writing CAGRA index bytes to zip: " + (System.currentTimeMillis() - start)); - - start = System.currentTimeMillis(); - cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".bf", bruteForceIndexBytes); - log.info("time for writing BRUTEFORCE index bytes to zip: " + (System.currentTimeMillis() - start)); - - start = System.currentTimeMillis(); - cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".hnsw", hnswIndexBytes); - log.info("time for writing HNSW index bytes to zip: " + (System.currentTimeMillis() - start)); - - start = System.currentTimeMillis(); - cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".vec", SerializationUtils.serialize(new ArrayList(field.vectors.values()))); - cuVSFile.addFile(segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".map", SerializationUtils.serialize(new ArrayList(field.vectors.keySet()))); - log.info("list serializing and writing: " + (System.currentTimeMillis() - start)); + + // start = System.currentTimeMillis(); + cuVSFile.addFile( + segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".cag", cagraIndexBytes); + // log.info( + // "time for writing CAGRA index bytes to zip: " + (System.currentTimeMillis() - start)); + + // start = System.currentTimeMillis(); + cuVSFile.addFile( + segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".bf", bruteForceIndexBytes); + /*log.info( + "time for writing BRUTEFORCE index bytes to zip: " + + (System.currentTimeMillis() - start));*/ + + // start = System.currentTimeMillis(); + cuVSFile.addFile( + segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".hnsw", hnswIndexBytes); + // log.info("time for writing HNSW index bytes to zip: " + (System.currentTimeMillis() - + // start)); + + // start = System.currentTimeMillis(); + cuVSFile.addFile( + segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".vec", + SerializationUtils.serialize(new ArrayList(field.vectors.values()))); + cuVSFile.addFile( + segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".map", + SerializationUtils.serialize(new ArrayList(field.vectors.keySet()))); + // log.info("list serializing and writing: " + (System.currentTimeMillis() - start)); field.vectors.clear(); } metaMap.put(segmentWriteState.segmentInfo.name, maxDoc); - cuVSFile.addFile(segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); + cuVSFile.addFile( + segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); cuVSFile.close(); - + CodecUtil.writeFooter(cuVSIndex); } - + SegmentOutputStream mergeOutputStream = null; CuVSSegmentFile mergedIndexFile = null; @@ -220,43 +268,50 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE readers.add(reader); } - log.info("Merging one field for segment: " + segmentWriteState.segmentInfo.name); - log.info("Segment files? 
" + Arrays.toString(segmentWriteState.directory.listAll())); + // log.info("Merging one field for segment: " + segmentWriteState.segmentInfo.name); + // log.info("Segment files? " + Arrays.toString(segmentWriteState.directory.listAll())); if (!List.of(segmentWriteState.directory.listAll()).contains(cuVSDataFilename)) { - IndexOutput mergedVectorIndex = segmentWriteState.directory.createOutput(cuVSDataFilename, segmentWriteState.context); - CodecUtil.writeIndexHeader(mergedVectorIndex, CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, - CuVSVectorsFormat.VERSION_CURRENT, segmentWriteState.segmentInfo.getId(), segmentWriteState.segmentSuffix); - this.mergeOutputStream = new SegmentOutputStream(mergedVectorIndex, 100000); + IndexOutput mergedVectorIndex = + segmentWriteState.directory.createOutput(cuVSDataFilename, segmentWriteState.context); + CodecUtil.writeIndexHeader( + mergedVectorIndex, + CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, + CuVSVectorsFormat.VERSION_CURRENT, + segmentWriteState.segmentInfo.getId(), + segmentWriteState.segmentSuffix); + this.mergeOutputStream = new SegmentOutputStream(mergedVectorIndex, 100000); mergedIndexFile = new CuVSSegmentFile(this.mergeOutputStream); } - - log.info("Segment files? " + Arrays.toString(segmentWriteState.directory.listAll())); + + // log.info("Segment files? " + Arrays.toString(segmentWriteState.directory.listAll())); if (mergeStrategy.equals(MergeStrategy.TRIVIAL_MERGE)) { - Util.getMergedArchiveCOS(segInputStreams, segmentWriteState.segmentInfo.name, this.mergeOutputStream - ); + throw new UnsupportedOperationException(); } else if (mergeStrategy.equals(MergeStrategy.NON_TRIVIAL_MERGE)) { - // nocommit: this doesn't merge all the fields - log.info("Readers: "+segInputStreams.size()+", deocMaps: "+mergeState.docMaps.length); + // log.info("Readers: " + segInputStreams.size() + ", deocMaps: " + + // mergeState.docMaps.length); ArrayList docMapList = new ArrayList(); for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { - CuVSVectorsReader reader = (CuVSVectorsReader) mergeState.knnVectorsReaders[i]; - for (CuVSIndex index: reader.cuvsIndexes.get(fieldInfo.name)) { - log.info("Mapping for segment ("+reader.fileName+"): " + index.getMapping()); - log.info("Mapping for segment ("+reader.fileName+"): " + index.getMapping().size()); - for (int id=0; id mergedVectors = Util.getMergedVectors(segInputStreams, fieldInfo.name, segmentWriteState.segmentInfo.name); - log.info("Final mapping: " + docMapList); - log.info("Final mapping: " + docMapList.size()); - log.info("Merged vectors: " + mergedVectors.size()); + + ArrayList mergedVectors = + Util.getMergedVectors( + segInputStreams, fieldInfo.name, segmentWriteState.segmentInfo.name); + // log.info("Final mapping: " + docMapList); + // log.info("Final mapping: " + docMapList.size()); + // log.info("Merged vectors: " + mergedVectors.size()); LinkedHashMap metaMap = new LinkedHashMap(); byte[] cagraIndexBytes = null; byte[] bruteForceIndexBytes = null; @@ -272,27 +327,36 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE bruteForceIndexBytes = createBruteForceIndex(vectors); hnswIndexBytes = createHnswIndex(vectors); } catch (Throwable e) { - e.printStackTrace(); + throw new RuntimeException(e); } - mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".cag", cagraIndexBytes); - mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".bf", bruteForceIndexBytes); - 
mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".hnsw", hnswIndexBytes); - mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".vec", SerializationUtils.serialize(mergedVectors)); - mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".map", SerializationUtils.serialize(docMapList)); + mergedIndexFile.addFile( + segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".cag", cagraIndexBytes); + mergedIndexFile.addFile( + segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".bf", + bruteForceIndexBytes); + mergedIndexFile.addFile( + segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".hnsw", hnswIndexBytes); + mergedIndexFile.addFile( + segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".vec", + SerializationUtils.serialize(mergedVectors)); + mergedIndexFile.addFile( + segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".map", + SerializationUtils.serialize(docMapList)); metaMap.put(segmentWriteState.segmentInfo.name, mergedVectors.size()); - if (mergedIndexFile.getFilesAdded().contains(segmentWriteState.segmentInfo.name + ".meta") == false) { - mergedIndexFile.addFile(segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); + if (mergedIndexFile.getFilesAdded().contains(segmentWriteState.segmentInfo.name + ".meta") + == false) { + mergedIndexFile.addFile( + segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); } - log.info("DocMaps: "+Arrays.toString(mergeState.docMaps)); + // log.info("DocMaps: " + Arrays.toString(mergeState.docMaps)); metaMap.clear(); } } - @Override public void finish() throws IOException { - if (this.mergeOutputStream!=null) { + if (this.mergeOutputStream != null) { mergedIndexFile.close(); CodecUtil.writeFooter(mergeOutputStream.out); IOUtils.close(mergeOutputStream.out); @@ -334,6 +398,5 @@ public void flush() throws IOException { public void close() throws IOException { this.flush(); } - } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java index d4d19fad7041..3c96aa37325b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -1,8 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.vectorsearch; import java.util.ArrayList; import java.util.List; - import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; @@ -55,7 +70,7 @@ public int k() { @Override @SuppressWarnings("cast") public boolean collect(int docId, float similarity) { - scoreDocs.add(new ScoreDoc(docId, 1f/(float)(similarity))); + scoreDocs.add(new ScoreDoc(docId, 1f / (float) (similarity))); return true; } @@ -67,8 +82,8 @@ public float minCompetitiveSimilarity() { @Override public TopDocs topDocs() { - return new TopDocs(new TotalHits(scoreDocs.size(), TotalHits.Relation.EQUAL_TO), + return new TopDocs( + new TotalHits(scoreDocs.size(), TotalHits.Relation.EQUAL_TO), scoreDocs.toArray(new ScoreDoc[scoreDocs.size()])); } - } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java index a352269fbb1b..787d7c81cc61 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java @@ -1,22 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.sandbox.vectorsearch; import java.io.IOException; import java.io.InputStream; - import org.apache.lucene.store.IndexInput; public class SegmentInputStream extends InputStream { - /** - * - */ + /** */ private final IndexInput indexInput; + public final long initialFilePointerPosition; public final long limit; public long pos = 0; // TODO: This input stream needs to be modified to enable buffering. - public SegmentInputStream(IndexInput indexInput, long limit, long initialFilePointerPosition) throws IOException { + public SegmentInputStream(IndexInput indexInput, long limit, long initialFilePointerPosition) + throws IOException { super(); this.indexInput = indexInput; this.initialFilePointerPosition = initialFilePointerPosition; @@ -86,5 +101,4 @@ public void close() { public int available() { throw new UnsupportedOperationException(); } - -} \ No newline at end of file +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java index a8200e7b897b..1ffb75037609 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java @@ -1,33 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.sandbox.vectorsearch; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.FileNotFoundException; import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.io.OutputStream; -import java.lang.invoke.MethodHandles; import java.util.ArrayList; -import java.util.LinkedHashMap; import java.util.List; -import java.util.Map; -import java.util.logging.Logger; -import java.util.zip.Deflater; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -import java.util.zip.ZipOutputStream; +import org.apache.commons.lang3.SerializationUtils; public class Util { - public static ByteArrayOutputStream getZipEntryBAOS(String fileName, SegmentInputStream segInputStream) - throws IOException { + public static ByteArrayOutputStream getZipEntryBAOS( + String fileName, SegmentInputStream segInputStream) throws IOException { segInputStream.reset(); ZipInputStream zipInputStream = new ZipInputStream(segInputStream); ByteArrayOutputStream baos = new ByteArrayOutputStream(); boolean fileFound = false; ZipEntry zipEntry; - while (zipInputStream.available() == 1 && ((zipEntry = zipInputStream.getNextEntry()) != null)) { + while (zipInputStream.available() == 1 + && ((zipEntry = zipInputStream.getNextEntry()) != null)) { if (zipEntry.getName().equals(fileName)) { fileFound = true; byte[] buffer = new byte[1024]; @@ -41,18 +49,19 @@ public static ByteArrayOutputStream getZipEntryBAOS(String fileName, SegmentInpu return baos; } - private static final Logger log = Logger.getLogger(Util.class.getName()); + // private static final Logger log = Logger.getLogger(Util.class.getName()); - public static ArrayList getMergedVectors(List segInputStreams, String fieldName, String mergedSegmentName) + public static ArrayList getMergedVectors( + List segInputStreams, String fieldName, String mergedSegmentName) throws IOException { ZipEntry zs; ArrayList mergedVectors = new ArrayList(); - log.info("Getting mergedVectors..."); + // log.info("Getting mergedVectors..."); for (SegmentInputStream segInputStream : segInputStreams) { segInputStream.reset(); ZipInputStream zipStream = new ZipInputStream(segInputStream); while ((zs = zipStream.getNextEntry()) != null) { - log.info("Getting mergedVectors... " + zs.getName()); + // log.info("Getting mergedVectors... 
" + zs.getName()); byte[] buffer = new byte[1024]; int length; if (zs.getName().endsWith(".vec")) { @@ -62,7 +71,7 @@ public static ArrayList getMergedVectors(List segIn while ((length = zipStream.read(buffer)) != -1) { baosM.write(buffer, 0, length); } - List m = deSerializeListInMemory(baosM.toByteArray()); + List m = SerializationUtils.deserialize(baosM.toByteArray()); mergedVectors.addAll(m); } } @@ -70,73 +79,4 @@ public static ArrayList getMergedVectors(List segIn } return mergedVectors; } - - public static void getMergedArchiveCOS(List segInputStreams, String mergedSegmentName, - OutputStream os) throws IOException { - ZipOutputStream zos = new ZipOutputStream(os); - ZipEntry zs; - Map mergedMetaMap = new LinkedHashMap(); - for (SegmentInputStream segInputStream : segInputStreams) { - segInputStream.reset(); - ZipInputStream zipStream = new ZipInputStream(segInputStream); - while ((zs = zipStream.getNextEntry()) != null) { - byte[] buffer = new byte[1024]; - int length; - if (zs.getName().endsWith(".meta")) { - ByteArrayOutputStream baosM = new ByteArrayOutputStream(); - while ((length = zipStream.read(buffer)) != -1) { - baosM.write(buffer, 0, length); - } - Map m = deSerializeMapInMemory(baosM.toByteArray()); - mergedMetaMap.putAll(m); - } else { - ZipEntry zipEntry = new ZipEntry(zs.getName()); - zos.putNextEntry(zipEntry); - zos.setLevel(Deflater.NO_COMPRESSION); - while ((length = zipStream.read(buffer)) != -1) { - zos.write(buffer, 0, length); - } - zos.closeEntry(); - } - } - } - // Finally put the merged meta file - ZipEntry mergedMetaZipEntry = new ZipEntry(mergedSegmentName + ".meta"); - zos.putNextEntry(mergedMetaZipEntry); - zos.setLevel(Deflater.NO_COMPRESSION); - new ObjectOutputStream(zos).writeObject(mergedMetaMap); // Java serialization should be avoided - zos.closeEntry(); - zos.close(); - } - - @SuppressWarnings("unchecked") - public static Map deSerializeMapInMemory(byte[] bytes) { - Map map = null; - ObjectInputStream ois = null; - try { - ois = new ObjectInputStream(new ByteArrayInputStream(bytes)); - map = (Map) ois.readObject(); - ois.close(); - } catch (Exception e) { - e.printStackTrace(); - } - - return map; - } - - @SuppressWarnings("unchecked") - public static List deSerializeListInMemory(byte[] bytes) { - List map = null; - ObjectInputStream ois = null; - try { - ois = new ObjectInputStream(new ByteArrayInputStream(bytes)); - map = (List) ois.readObject(); - ois.close(); - } catch (Exception e) { - e.printStackTrace(); - } - - return map; - } - } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java index 67199edca2f6..ce9cd8cc52d2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java @@ -1 +1,17 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.lucene.sandbox.vectorsearch; diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 38b31884377d..6f0a89e365d1 100644 --- a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -1 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + org.apache.lucene.sandbox.vectorsearch.CuVSCodec \ No newline at end of file diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java index 15a023d6fbd3..70325a3aa294 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java @@ -1,14 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.vectorsearch; import java.io.IOException; -import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Random; import java.util.TreeMap; - +import java.util.logging.Logger; import org.apache.lucene.codecs.Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -30,13 +45,11 @@ import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; @SuppressSysoutChecks(bugUrl = "prints info from within cuvs") public class TestCuVS extends LuceneTestCase { - private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + protected static Logger log = Logger.getLogger(TestCuVS.class.getName()); private static IndexSearcher searcher; private static IndexReader reader; @@ -45,16 +58,16 @@ public class TestCuVS extends LuceneTestCase { public static int DATASET_SIZE_LIMIT = 1000; public static int DIMENSIONS_LIMIT = 2048; public static int NUM_QUERIES_LIMIT = 10; - public static int TOP_K_LIMIT = 64; // nocommit This fails beyond 64 + public static int TOP_K_LIMIT = 64; // TODO This fails beyond 64 public static float[][] dataset = null; @BeforeClass public static void beforeClass() throws Exception { directory = newDirectory(); - + Codec codec = new CuVSCodec(); - + RandomIndexWriter writer = new RandomIndexWriter( random(), @@ -63,7 +76,7 @@ public static void beforeClass() throws Exception { .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)) .setCodec(codec) .setMergePolicy(newTieredMergePolicy())); - + log.info("Merge Policy: " + writer.w.getConfig().getMergePolicy()); Random random = random(); @@ -74,8 +87,10 @@ public static void beforeClass() throws Exception { Document doc = new Document(); doc.add(new StringField("id", String.valueOf(i), Field.Store.YES)); doc.add(newTextField("field", English.intToEnglish(i), Field.Store.YES)); - boolean skipVector = random.nextInt(10) < 0; // nocommit disable testing with holes for now, there's some bug. - if (!skipVector || datasetSize<100) { // about 10th of the documents shouldn't have a single vector + boolean skipVector = + random.nextInt(10) < 0; // disable testing with holes for now, there's some bug. 
+ if (!skipVector + || datasetSize < 100) { // about 10th of the documents shouldn't have a single vector doc.add(new KnnFloatVectorField("vector", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); doc.add(new KnnFloatVectorField("vector2", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); } @@ -90,7 +105,6 @@ public static void beforeClass() throws Exception { @AfterClass public static void afterClass() throws Exception { - // nocommit This fails until flat vectors are implemented reader.close(); directory.close(); searcher = null; @@ -105,46 +119,30 @@ public void testVectorSearch() throws IOException { int numQueries = random.nextInt(NUM_QUERIES_LIMIT) + 1; int topK = Math.min(random.nextInt(TOP_K_LIMIT) + 1, dataset.length); - if(dataset.length < topK) topK = dataset.length; + if (dataset.length < topK) topK = dataset.length; float[][] queries = generateQueries(random, dataset[0].length, numQueries); List> expected = generateExpectedResults(topK, dataset, queries); - - debugPrintDatasetAndQueries(dataset, queries); - log.info("Dataset size: {}x{}", dataset.length, dataset[0].length); - log.info("Query size: {}x{}", numQueries, queries[0].length); - log.info("TopK: {}", topK); + log.info("Dataset size: " + dataset.length + "x" + dataset[0].length); + log.info("Query size: " + numQueries + "x" + queries[0].length); + log.info("TopK: " + topK); Query query = new CuVSKnnFloatVectorQuery("vector", queries[0], topK, topK, 1); int correct[] = new int[topK]; - for (int i=0; i> generateExpectedResults(int topK, float[][] dataset, float[][] queries) { + + private static List> generateExpectedResults( + int topK, float[][] dataset, float[][] queries) { List> neighborsResult = new ArrayList<>(); int dimensions = dataset[0].length; @@ -186,13 +185,19 @@ private static List> generateExpectedResults(int topK, float[][] d Map sorted = new TreeMap(distances); log.info("EXPECTED: " + sorted); - + // Sort by distance and select the topK nearest neighbors - List neighbors = distances.entrySet().stream() - .sorted(Map.Entry.comparingByValue()) - .map(Map.Entry::getKey) - .toList(); - neighborsResult.add(neighbors.subList(0, Math.min(topK * 3, dataset.length))); // generate double the topK results in the expected array + List neighbors = + distances.entrySet().stream() + .sorted(Map.Entry.comparingByValue()) + .map(Map.Entry::getKey) + .toList(); + neighborsResult.add( + neighbors.subList( + 0, + Math.min( + topK * 3, + dataset.length))); // generate double the topK results in the expected array } log.info("Expected results generated successfully."); diff --git a/versions.lock b/versions.lock index 26de44f99e2d..dfa465a1b3fe 100644 --- a/versions.lock +++ b/versions.lock @@ -4,6 +4,7 @@ "main_dependencies" : { "com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.8.1" : "fa9ef26b,refs=4", "com.ibm.icu:icu4j:74.2" : "47ea4550,refs=6", + "com.nvidia.cuvs:cuvs-java:25.02" : "0129b4f0,refs=6", "commons-codec:commons-codec:1.13" : "e9962aab,refs=4", "io.sgr:s2-geometry-library-java:1.0.0" : "cbc357ab,refs=4", "junit:junit:4.13.1" : "fa9ef26b,refs=4", @@ -11,6 +12,7 @@ "net.sourceforge.nekohtml:nekohtml:1.9.17" : "5ce8cdc6,refs=2", "org.antlr:antlr4-runtime:4.11.1" : "d9953130,refs=4", "org.apache.commons:commons-compress:1.19" : "5ce8cdc6,refs=2", + "org.apache.commons:commons-lang3:3.17.0" : "0129b4f0,refs=6", "org.apache.commons:commons-math3:3.6.1" : "85a1e4c6,refs=2", "org.apache.opennlp:opennlp-tools:2.3.2" : "2f760bab,refs=4", "org.carrot2:morfologik-fsa:2.1.9" : "79af844b,refs=4", @@ -46,6 +48,7 
@@ "com.google.j2objc:j2objc-annotations:1.3" : "6897bc09,refs=38", "com.google.protobuf:protobuf-java:3.19.2" : "6897bc09,refs=38", "com.ibm.icu:icu4j:74.2" : "ffa00415,refs=8", + "com.nvidia.cuvs:cuvs-java:25.02" : "7ac6f8d9,refs=9", "commons-codec:commons-codec:1.13" : "733734f0,refs=6", "io.github.java-diff-utils:java-diff-utils:4.0" : "6897bc09,refs=38", "io.sgr:s2-geometry-library-java:1.0.0" : "1d5a4b2b,refs=4", @@ -55,6 +58,7 @@ "net.sourceforge.nekohtml:nekohtml:1.9.17" : "6f16ff86,refs=2", "org.antlr:antlr4-runtime:4.11.1" : "6fbc4021,refs=5", "org.apache.commons:commons-compress:1.19" : "6f16ff86,refs=2", + "org.apache.commons:commons-lang3:3.17.0" : "7ac6f8d9,refs=9", "org.apache.commons:commons-math3:3.6.1" : "152d9f78,refs=3", "org.apache.opennlp:opennlp-tools:2.3.2" : "b91715f0,refs=6", "org.assertj:assertj-core:3.21.0" : "b7ba1646,refs=2", @@ -79,6 +83,32 @@ } }, "because" : { + "0129b4f0" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "152d9f78" : [ { "configuration" : "annotationProcessor", @@ -405,6 +435,44 @@ "projectPath" : ":lucene:analysis:morfologik" } ], + "7ac6f8d9" : [ + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "85a1e4c6" : [ { "configuration" : "compileClasspath", diff --git a/versions.toml b/versions.toml index 327848fd10d4..d0db5fd20d9d 100644 --- a/versions.toml +++ b/versions.toml @@ -35,7 +35,6 @@ s2-geometry = "1.0.0" spatial4j = "0.8" xerces = "2.12.0" zstd = "1.5.5-11" -jackson-core = "2.18.2" [libraries] antlr-core = { module = "org.antlr:antlr4", version.ref = "antlr" } @@ -57,7 +56,6 @@ flexmark-ext-tables = { module = "com.vladsch.flexmark:flexmark-ext-tables", ver groovy = { module = "org.apache.groovy:groovy-all", version.ref = "groovy" } hamcrest = { module = "org.hamcrest:hamcrest", version.ref = "hamcrest" } icu4j = { module = "com.ibm.icu:icu4j", version.ref = "icu4j" } -jackson-core = { module = "com.fasterxml.jackson.core:jackson-core", version.ref = "jackson-core" } javacc = { module = "net.java.dev.javacc:javacc", version.ref = "javacc" } jflex = { module = "de.jflex:jflex", version.ref = "jflex" } jgit = { module = "org.eclipse.jgit:org.eclipse.jgit", version.ref = "jgit" } From 5851f44952095e77c84c7f0d5941777cfcaad34c Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 14 Jan 2025 12:33:20 +0100 Subject: [PATCH 19/88] 
Remove `acceptDocs` argument from `DocIdSetIterator#intoBitSet` and introduce `Bits#applyMask`. (#14134) Most `DocIdSetIterator` implementations can no longer implement `#intoBitSet` efficiently as soon as there are live docs. So this commit remove this argument and instead introduces a new `Bits#applyMask` API that helps clear bits in a bit set when the corresponding doc ID is not live. Relates #14133 --- lucene/CHANGES.txt | 3 + .../lucene101/Lucene101PostingsReader.java | 21 +-- .../apache/lucene/search/BooleanScorer.java | 61 ++++--- .../search/DenseConjunctionBulkScorer.java | 10 +- .../search/DisjunctionDISIApproximation.java | 8 +- .../lucene/search/DocIdSetIterator.java | 12 +- .../apache/lucene/util/BitSetIterator.java | 17 +- .../src/java/org/apache/lucene/util/Bits.java | 28 +++ .../org/apache/lucene/util/FixedBitSet.java | 161 ++++++++++++++++-- .../apache/lucene/util/TestFixedBitSet.java | 92 ++++++---- .../asserting/AssertingLiveDocsFormat.java | 7 + .../tests/index/RandomPostingsTester.java | 10 +- .../lucene/tests/search/AssertingScorer.java | 6 +- 13 files changed, 299 insertions(+), 137 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4c1aac688497..31ddd83d907e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -41,6 +41,9 @@ API Changes * GITHUB#14069: Added DocIdSetIterator#intoBitSet API to let implementations optimize loading doc IDs into a bit set. (Adrien Grand) +* GITHUB#14134: Added Bits#applyMask API to help apply live docs as a mask on a + bit set of matches. (Adrien Grand) + New Features --------------------- (No changes) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java index 6cd16bb7cc36..68df7683b28c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java @@ -53,7 +53,6 @@ import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitUtil; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; @@ -878,16 +877,13 @@ public int advance(int target) throws IOException { } @Override - public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset) - throws IOException { + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { if (doc >= upTo) { return; } // Handle the current doc separately, it may be on the previous docBuffer. 
- if (acceptDocs == null || acceptDocs.get(doc)) { - bitSet.set(doc - offset); - } + bitSet.set(doc - offset); for (; ; ) { if (docBufferUpto == BLOCK_SIZE) { @@ -898,7 +894,7 @@ public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset int start = docBufferUpto; int end = computeBufferEndBoundary(upTo); if (end != 0) { - bufferIntoBitSet(start, end, acceptDocs, bitSet, offset); + bufferIntoBitSet(start, end, bitSet, offset); doc = docBuffer[end - 1]; } docBufferUpto = end; @@ -922,15 +918,12 @@ private int computeBufferEndBoundary(int upTo) { } } - private void bufferIntoBitSet( - int start, int end, Bits acceptDocs, FixedBitSet bitSet, int offset) throws IOException { - // acceptDocs#get (if backed by FixedBitSet), bitSet#set and `doc - offset` get - // auto-vectorized + private void bufferIntoBitSet(int start, int end, FixedBitSet bitSet, int offset) + throws IOException { + // bitSet#set and `doc - offset` get auto-vectorized for (int i = start; i < end; ++i) { int doc = docBuffer[i]; - if (acceptDocs == null || acceptDocs.get(doc)) { - bitSet.set(doc - offset); - } + bitSet.set(doc - offset); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java index 5d7dfaf8b832..a6599a57fd25 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java @@ -164,37 +164,6 @@ public long cost() { return cost; } - private void scoreDisiWrapperIntoBitSet(DisiWrapper w, Bits acceptDocs, int min, int max) - throws IOException { - boolean needsScores = BooleanScorer.this.needsScores; - FixedBitSet matching = BooleanScorer.this.matching; - Bucket[] buckets = BooleanScorer.this.buckets; - - DocIdSetIterator it = w.iterator; - Scorable scorer = w.scorable; - int doc = w.doc; - if (doc < min) { - doc = it.advance(min); - } - if (buckets == null) { - it.intoBitSet(acceptDocs, max, matching, doc & ~MASK); - } else { - for (; doc < max; doc = it.nextDoc()) { - if (acceptDocs == null || acceptDocs.get(doc)) { - final int i = doc & MASK; - matching.set(i); - final Bucket bucket = buckets[i]; - bucket.freq++; - if (needsScores) { - bucket.score += scorer.score(); - } - } - } - } - - w.doc = it.docID(); - } - private void scoreWindowIntoBitSetAndReplay( LeafCollector collector, Bits acceptDocs, @@ -207,7 +176,35 @@ private void scoreWindowIntoBitSetAndReplay( for (int i = 0; i < numScorers; ++i) { final DisiWrapper w = scorers[i]; assert w.doc < max; - scoreDisiWrapperIntoBitSet(w, acceptDocs, min, max); + + DocIdSetIterator it = w.iterator; + int doc = w.doc; + if (doc < min) { + doc = it.advance(min); + } + if (buckets == null) { + // This doesn't apply live docs, so we'll need to apply them later + it.intoBitSet(max, matching, base); + } else { + for (; doc < max; doc = it.nextDoc()) { + if (acceptDocs == null || acceptDocs.get(doc)) { + final int d = doc & MASK; + matching.set(d); + final Bucket bucket = buckets[d]; + bucket.freq++; + if (needsScores) { + bucket.score += w.scorable.score(); + } + } + } + } + + w.doc = it.docID(); + } + + if (buckets == null && acceptDocs != null) { + // In this case, live docs have not been applied yet. 
+ acceptDocs.applyMask(matching, base); } docIdStreamView.base = base; diff --git a/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java index 2acf04ba501b..121687245248 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DenseConjunctionBulkScorer.java @@ -105,7 +105,11 @@ private void scoreWindowUsingBitSet( assert clauseWindowMatches.scanIsEmpty(); int offset = lead.docID(); - lead.intoBitSet(acceptDocs, max, windowMatches, offset); + lead.intoBitSet(max, windowMatches, offset); + if (acceptDocs != null) { + // Apply live docs. + acceptDocs.applyMask(windowMatches, offset); + } int upTo = 0; for (; @@ -116,9 +120,7 @@ private void scoreWindowUsingBitSet( if (other.docID() < offset) { other.advance(offset); } - // No need to apply acceptDocs on other clauses since we already applied live docs on the - // leading clause. - other.intoBitSet(null, max, clauseWindowMatches, offset); + other.intoBitSet(max, clauseWindowMatches, offset); windowMatches.and(clauseWindowMatches); clauseWindowMatches.clear(); } diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java index 6ab57c7b180c..cedababbce6b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java @@ -21,7 +21,6 @@ import java.util.Collection; import java.util.Comparator; import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; /** @@ -150,17 +149,16 @@ public int advance(int target) throws IOException { } @Override - public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset) - throws IOException { + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { while (leadTop.doc < upTo) { - leadTop.approximation.intoBitSet(acceptDocs, upTo, bitSet, offset); + leadTop.approximation.intoBitSet(upTo, bitSet, offset); leadTop.doc = leadTop.approximation.docID(); leadTop = leadIterators.updateTop(); } minOtherDoc = Integer.MAX_VALUE; for (DisiWrapper w : otherIterators) { - w.approximation.intoBitSet(acceptDocs, upTo, bitSet, offset); + w.approximation.intoBitSet(upTo, bitSet, offset); w.doc = w.approximation.docID(); minOtherDoc = Math.min(minOtherDoc, w.doc); } diff --git a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java index ee30f627a56b..e0bee1da2314 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java @@ -17,7 +17,6 @@ package org.apache.lucene.search; import java.io.IOException; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; /** @@ -220,9 +219,7 @@ protected final int slowAdvance(int target) throws IOException { * *

        * for (int doc = docID(); doc < upTo; doc = nextDoc()) {
    -   *   if (acceptDocs == null || acceptDocs.get(doc)) {
    -   *     bitSet.set(doc - offset);
    -   *   }
    +   *   bitSet.set(doc - offset);
        * }
        * </pre>
    * @@ -233,13 +230,10 @@ protected final int slowAdvance(int target) throws IOException { * * @lucene.internal */ - public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset) - throws IOException { + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { assert offset <= docID(); for (int doc = docID(); doc < upTo; doc = nextDoc()) { - if (acceptDocs == null || acceptDocs.get(doc)) { - bitSet.set(doc - offset); - } + bitSet.set(doc - offset); } } } diff --git a/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java b/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java index 4d7c83057cbe..ff74c107b13c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java @@ -99,20 +99,13 @@ public long cost() { } @Override - public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset) - throws IOException { - // TODO: Can we also optimize the case when acceptDocs is not null? - if (acceptDocs == null - && offset < bits.length() - && bits instanceof FixedBitSet fixedBits - // no bits are set between `offset` and `doc` - && fixedBits.nextSetBit(offset) == doc - // the whole `bitSet` is getting filled - && (upTo - offset == bitSet.length())) { - bitSet.orRange(fixedBits, offset); + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { + upTo = Math.min(upTo, bits.length()); + if (upTo > doc && bits instanceof FixedBitSet fixedBits) { + FixedBitSet.orRange(fixedBits, doc, bitSet, doc - offset, upTo - doc); advance(upTo); // set the current doc } else { - super.intoBitSet(acceptDocs, upTo, bitSet, offset); + super.intoBitSet(upTo, bitSet, offset); } } } diff --git a/lucene/core/src/java/org/apache/lucene/util/Bits.java b/lucene/core/src/java/org/apache/lucene/util/Bits.java index dd42ad4b1973..61757a1a34e4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/Bits.java +++ b/lucene/core/src/java/org/apache/lucene/util/Bits.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.util; +import org.apache.lucene.search.DocIdSetIterator; + /** * Interface for Bitset-like structures. * @@ -34,6 +36,32 @@ public interface Bits { /** Returns the number of bits in this set */ int length(); + /** + * Apply this {@code Bits} instance to the given {@link FixedBitSet}, which starts at the given + * {@code offset}. + * + *

    This should behave the same way as the default implementation, which does the following: + * + *

    +   * for (int i = bitSet.nextSetBit(0);
    +   *     i != DocIdSetIterator.NO_MORE_DOCS;
    +   *     i = i + 1 >= bitSet.length() ? DocIdSetIterator.NO_MORE_DOCS : bitSet.nextSetBit(i + 1)) {
    +   *   if (get(offset + i) == false) {
    +   *     bitSet.clear(i);
    +   *   }
    +   * }
    +   * </pre>
    + */ + default void applyMask(FixedBitSet bitSet, int offset) { + for (int i = bitSet.nextSetBit(0); + i != DocIdSetIterator.NO_MORE_DOCS; + i = i + 1 >= bitSet.length() ? DocIdSetIterator.NO_MORE_DOCS : bitSet.nextSetBit(i + 1)) { + if (get(offset + i) == false) { + bitSet.clear(i); + } + } + } + Bits[] EMPTY_ARRAY = new Bits[0]; /** Bits impl of the specified length with all bits set. */ diff --git a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java index 584f30b3baac..1b6954d2eb66 100644 --- a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.util.Arrays; +import java.util.Objects; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; @@ -346,7 +347,7 @@ public void or(DocIdSetIterator iter) throws IOException { } else { checkUnpositioned(iter); iter.nextDoc(); - iter.intoBitSet(null, DocIdSetIterator.NO_MORE_DOCS, this, 0); + iter.intoBitSet(DocIdSetIterator.NO_MORE_DOCS, this, 0); } } @@ -364,40 +365,150 @@ private void or(final int otherOffsetWords, final long[] otherArr, final int oth } } + /** Read {@code numBits} (between 1 and 63) bits from {@code bitSet} at {@code from}. */ + private static long readNBits(long[] bitSet, int from, int numBits) { + assert numBits > 0 && numBits < Long.SIZE; + long bits = bitSet[from >> 6] >>> from; + int numBitsSoFar = Long.SIZE - (from & 0x3F); + if (numBitsSoFar < numBits) { + bits |= bitSet[(from >> 6) + 1] << -from; + } + return bits & ((1L << numBits) - 1); + } + /** - * Or {@code min(length(), other.length() - from} bits starting at {@code from} from {@code other} - * into this bit set starting at 0. + * Or {@code length} bits starting at {@code sourceFrom} from {@code source} into {@code dest} + * starting at {@code destFrom}. */ - void orRange(FixedBitSet other, int from) { - int numBits = Math.min(length(), other.length() - from); - if (numBits <= 0) { + public static void orRange( + FixedBitSet source, int sourceFrom, FixedBitSet dest, int destFrom, int length) { + assert length >= 0; + Objects.checkFromIndexSize(sourceFrom, length, source.length()); + Objects.checkFromIndexSize(destFrom, length, dest.length()); + + if (length == 0) { + return; + } + + long[] sourceBits = source.getBits(); + long[] destBits = dest.getBits(); + + // First, align `destFrom` with a word start, ie. 
a multiple of Long.SIZE (64) + if ((destFrom & 0x3F) != 0) { + int numBitsNeeded = Math.min(-destFrom & 0x3F, length); + long bits = readNBits(sourceBits, sourceFrom, numBitsNeeded) << destFrom; + destBits[destFrom >> 6] |= bits; + + sourceFrom += numBitsNeeded; + destFrom += numBitsNeeded; + length -= numBitsNeeded; + } + + if (length == 0) { return; } - int numFullWords = numBits >> 6; - long[] otherBits = other.getBits(); - int wordOffset = from >> 6; - if ((from & 0x3F) == 0) { - // from is aligned with a long[] + + assert (destFrom & 0x3F) == 0; + + // Now OR at the word level + int numFullWords = length >> 6; + int sourceWordFrom = sourceFrom >> 6; + int destWordFrom = destFrom >> 6; + + // Note: these two for loops auto-vectorize + if ((sourceFrom & 0x3F) == 0) { + // sourceFrom and destFrom are both aligned with a long[] for (int i = 0; i < numFullWords; ++i) { - bits[i] |= otherBits[wordOffset + i]; + destBits[destWordFrom + i] |= sourceBits[sourceWordFrom + i]; } } else { for (int i = 0; i < numFullWords; ++i) { - bits[i] |= (otherBits[wordOffset + i] >>> from) | (otherBits[wordOffset + i + 1] << -from); + destBits[destWordFrom + i] |= + (sourceBits[sourceWordFrom + i] >>> sourceFrom) + | (sourceBits[sourceWordFrom + i + 1] << -sourceFrom); } } - // Handle the remainder - for (int i = numFullWords << 6; i < numBits; ++i) { - if (other.get(from + i)) { - set(i); + sourceFrom += numFullWords << 6; + destFrom += numFullWords << 6; + length -= numFullWords << 6; + + // Finally handle tail bits + if (length > 0) { + long bits = readNBits(sourceBits, sourceFrom, length); + destBits[destFrom >> 6] |= bits; + } + } + + /** + * And {@code length} bits starting at {@code sourceFrom} from {@code source} into {@code dest} + * starting at {@code destFrom}. + */ + public static void andRange( + FixedBitSet source, int sourceFrom, FixedBitSet dest, int destFrom, int length) { + assert length >= 0 : length; + Objects.checkFromIndexSize(sourceFrom, length, source.length()); + Objects.checkFromIndexSize(destFrom, length, dest.length()); + + if (length == 0) { + return; + } + + long[] sourceBits = source.getBits(); + long[] destBits = dest.getBits(); + + // First, align `destFrom` with a word start, ie. 
a multiple of Long.SIZE (64) + if ((destFrom & 0x3F) != 0) { + int numBitsNeeded = Math.min(-destFrom & 0x3F, length); + long bits = readNBits(sourceBits, sourceFrom, numBitsNeeded) << destFrom; + bits |= ~(((1L << numBitsNeeded) - 1) << destFrom); + destBits[destFrom >> 6] &= bits; + + sourceFrom += numBitsNeeded; + destFrom += numBitsNeeded; + length -= numBitsNeeded; + } + + if (length == 0) { + return; + } + + assert (destFrom & 0x3F) == 0; + + // Now AND at the word level + int numFullWords = length >> 6; + int sourceWordFrom = sourceFrom >> 6; + int destWordFrom = destFrom >> 6; + + // Note: these two for loops auto-vectorize + if ((sourceFrom & 0x3F) == 0) { + // sourceFrom and destFrom are both aligned with a long[] + for (int i = 0; i < numFullWords; ++i) { + destBits[destWordFrom + i] &= sourceBits[sourceWordFrom + i]; } + } else { + for (int i = 0; i < numFullWords; ++i) { + destBits[destWordFrom + i] &= + (sourceBits[sourceWordFrom + i] >>> sourceFrom) + | (sourceBits[sourceWordFrom + i + 1] << -sourceFrom); + } + } + + sourceFrom += numFullWords << 6; + destFrom += numFullWords << 6; + length -= numFullWords << 6; + + // Finally handle tail bits + if (length > 0) { + long bits = readNBits(sourceBits, sourceFrom, length); + bits |= (~0L << length); + destBits[destFrom >> 6] &= bits; } } /** this = this OR other */ public void or(FixedBitSet other) { - orRange(other, 0); + orRange(other, 0, this, 0, other.length()); } /** this = this XOR other */ @@ -687,4 +798,18 @@ public static FixedBitSet copyOf(Bits bits) { public Bits asReadOnlyBits() { return new FixedBits(bits, numBits); } + + @Override + public void applyMask(FixedBitSet bitSet, int offset) { + // Note: Some scorers don't track maxDoc and may thus call this method with an offset that is + // beyond bitSet.length() + int length = Math.min(bitSet.length(), length() - offset); + if (length >= 0) { + andRange(this, offset, bitSet, 0, length); + } + if (length < bitSet.length() + && bitSet.nextSetBit(Math.max(0, length)) != DocIdSetIterator.NO_MORE_DOCS) { + throw new IllegalArgumentException("Some bits are set beyond the end of live docs"); + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java index b19e17f897c2..39acd5ea209e 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java @@ -17,9 +17,7 @@ package org.apache.lucene.util; import java.io.IOException; -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; import java.util.Random; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.tests.util.BaseBitSetTestCase; @@ -646,40 +644,72 @@ public void testScanIsEmpty() { } public void testOrRange() { - FixedBitSet set1 = new FixedBitSet(1_000); - FixedBitSet set2 = new FixedBitSet(10_000); - for (int i = 0; i < set2.length(); i += 3) { - set2.set(i); + FixedBitSet dest = new FixedBitSet(1_000); + FixedBitSet source = new FixedBitSet(10_000); + for (int i = 0; i < source.length(); i += 3) { + source.set(i); } - // Check different values of `offset` - List offsets = new ArrayList<>(); - for (int offset = 64; offset < 128; ++offset) { - // Test all possible alignments - offsets.add(offset); + // Test all possible alignments, and both a "short" (less than 64) and a long length. 
+ for (int sourceFrom = 64; sourceFrom < 128; ++sourceFrom) { + for (int destFrom = 256; destFrom < 320; ++destFrom) { + for (int length : + new int[] { + 0, + TestUtil.nextInt(random(), 1, Long.SIZE - 1), + TestUtil.nextInt(random(), Long.SIZE, 512) + }) { + dest.clear(); + for (int i = 0; i < dest.length(); i += 10) { + dest.set(i); + } + FixedBitSet.orRange(source, sourceFrom, dest, destFrom, length); + for (int i = 0; i < dest.length(); ++i) { + boolean destSet = i % 10 == 0; + if (i < destFrom || i >= destFrom + length) { + // Outside of the range, unmodified + assertEquals("" + i, destSet, dest.get(i)); + } else { + boolean sourceSet = source.get(sourceFrom + (i - destFrom)); + assertEquals(sourceSet || destSet, dest.get(i)); + } + } + } + } } - for (int offset = set2.length() - 128; offset < set2.length() - 64; ++offset) { - // Again, test all possible alignments, but this time we stop or-ing bits when exceeding the - // size of set2 rather than set1 - offsets.add(offset); + } + + public void testAndRange() { + FixedBitSet dest = new FixedBitSet(1_000); + FixedBitSet source = new FixedBitSet(10_000); + for (int i = 0; i < source.length(); i += 3) { + source.set(i); } - for (int offset : offsets) { - set1.clear(); - for (int i = 0; i < set1.length(); i += 10) { - set1.set(i); - } - set1.orRange(set2, offset); - int upTo = Math.min(set1.length(), set2.length() - offset); - for (int i = 0; i < set1.length(); ++i) { - if (i % 10 == 0 || i >= upTo) { - // These bits were set before, they should still be set - assertEquals(i % 10 == 0, set1.get(i)); - } else if ((offset + i) % 3 == 0) { - // These bits were set in set1, should be set in set2 - assertTrue(set1.get(i)); - } else { - assertFalse(set1.get(i)); + // Test all possible alignments, and both a "short" (less than 64) and a long length. + for (int sourceFrom = 64; sourceFrom < 128; ++sourceFrom) { + for (int destFrom = 256; destFrom < 320; ++destFrom) { + for (int length : + new int[] { + 0, + TestUtil.nextInt(random(), 1, Long.SIZE - 1), + TestUtil.nextInt(random(), Long.SIZE, 512) + }) { + dest.clear(); + for (int i = 0; i < dest.length(); i += 2) { + dest.set(i); + } + FixedBitSet.andRange(source, sourceFrom, dest, destFrom, length); + for (int i = 0; i < dest.length(); ++i) { + boolean destSet = i % 2 == 0; + if (i < destFrom || i >= destFrom + length) { + // Outside of the range, unmodified + assertEquals("" + i, destSet, dest.get(i)); + } else { + boolean sourceSet = source.get(sourceFrom + (i - destFrom)); + assertEquals("" + i, sourceSet && destSet, dest.get(i)); + } + } } } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingLiveDocsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingLiveDocsFormat.java index f45ea821a555..e2152e45aa58 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingLiveDocsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingLiveDocsFormat.java @@ -24,6 +24,7 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; /** Just like the default live docs format but with additional asserts. 
*/ public class AssertingLiveDocsFormat extends LiveDocsFormat { @@ -88,6 +89,12 @@ public int length() { return in.length(); } + @Override + public void applyMask(FixedBitSet bitSet, int offset) { + assert offset >= 0; + in.applyMask(bitSet, offset); + } + @Override public String toString() { return "Asserting(" + in + ")"; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java index 64ca9cca35f0..15c9c324732c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java @@ -1388,10 +1388,6 @@ private void verifyEnum( PostingsEnum pe2 = termsEnum.postings(null, flags); FixedBitSet set1 = new FixedBitSet(1024); FixedBitSet set2 = new FixedBitSet(1024); - FixedBitSet acceptDocs = new FixedBitSet(maxDoc); - for (int i = 0; i < maxDoc; i += 2) { - acceptDocs.set(i); - } while (true) { pe1.nextDoc(); @@ -1400,11 +1396,9 @@ private void verifyEnum( int offset = TestUtil.nextInt(random, Math.max(0, pe1.docID() - set1.length()), pe1.docID()); int upTo = offset + random.nextInt(set1.length()); - pe1.intoBitSet(acceptDocs, upTo, set1, offset); + pe1.intoBitSet(upTo, set1, offset); for (int d = pe2.docID(); d < upTo; d = pe2.nextDoc()) { - if (acceptDocs.get(d)) { - set2.set(d - offset); - } + set2.set(d - offset); } assertEquals(set1, set2); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java index 7200d4b5f4dc..9717f738e82e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/AssertingScorer.java @@ -24,7 +24,6 @@ import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.TwoPhaseIterator; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; /** Wraps a Scorer with additional checks */ @@ -196,11 +195,10 @@ public long cost() { } @Override - public void intoBitSet(Bits acceptDocs, int upTo, FixedBitSet bitSet, int offset) - throws IOException { + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { assert docID() != -1; assert offset <= docID(); - in.intoBitSet(acceptDocs, upTo, bitSet, offset); + in.intoBitSet(upTo, bitSet, offset); assert docID() >= upTo; } }; From 26e5a8d6c32eea0f1a4e54557c1da9b1c1cc0e49 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 14 Jan 2025 19:02:29 +0100 Subject: [PATCH 20/88] Encode dense blocks of postings as bit sets. (#14133) Bit sets can be faster at advancing and more storage-efficient on dense blocks of postings. This is not a new idea, @mkhludnev proposed something similar a long time ago #6116. @msokolov recently brought up (#14080) that such an encoding has become especially appealing with the introduction of the `DocIdSetIterator#loadIntoBitSet` API, and the fact that non-scoring disjunctions and dense conjunctions now take advantage of it. Indeed, if postings are stored in a bit set, `#loadIntoBitSet` would just need to OR the postings bits into the bits that are used as an intermediate representation of matches of the query. 
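To make the storage trade-off concrete, here is a minimal sketch (not the actual writer code from this patch; the helper name encodeBlockAsBitSet is made up for illustration) of the cost comparison between packed deltas (FOR) and a bit set for one block of 128 strictly positive doc-ID deltas:

    // Sketch only: pick whichever encoding needs fewer bits for one block of deltas.
    // Assumes every delta is >= 1, since doc IDs within a block are strictly increasing.
    static boolean encodeBlockAsBitSet(int[] docDeltaBuffer) {
      int bitsPerValue = 0;
      int sum = 0;
      for (int delta : docDeltaBuffer) {
        // number of bits needed to store this delta as a packed integer
        bitsPerValue = Math.max(bitsPerValue, 32 - Integer.numberOfLeadingZeros(delta));
        sum += delta; // width of the doc-ID range covered by the block
      }
      int forCost = bitsPerValue * docDeltaBuffer.length; // bits used by packed deltas (FOR)
      int bitSetCost = sum;                                // one bit per doc ID in the covered range
      return bitSetCost <= forCost; // prefer the bit set when it is at least as compact
    }

In the patch itself this decision is recorded in the block's leading marker byte: a positive value is the FOR bits-per-value, a negative value is minus the number of bit-set longs, and 0 means the 128 docs in the block are consecutive.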
--- .../checksums/generateForDeltaUtil.json | 4 +- .../lucene/codecs/lucene101/ForDeltaUtil.java | 74 +++----- .../lucene101/Lucene101PostingsFormat.java | 22 ++- .../lucene101/Lucene101PostingsReader.java | 166 +++++++++++++++--- .../lucene101/Lucene101PostingsWriter.java | 67 ++++++- .../codecs/lucene101/gen_ForDeltaUtil.py | 73 +++----- .../codecs/lucene101/TestForDeltaUtil.java | 7 +- .../TestLucene101PostingsFormatV0.java | 34 ++++ 8 files changed, 315 insertions(+), 132 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormatV0.java diff --git a/lucene/core/src/generated/checksums/generateForDeltaUtil.json b/lucene/core/src/generated/checksums/generateForDeltaUtil.json index 96f49fa81ff1..85765bbd7cbc 100644 --- a/lucene/core/src/generated/checksums/generateForDeltaUtil.json +++ b/lucene/core/src/generated/checksums/generateForDeltaUtil.json @@ -1,4 +1,4 @@ { - "lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java": "e0bf6071bcdefaa297e0bb92f79615201777652d", - "lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py": "d7484ab18da33e5cb73faaf84b4e2bb832b62f9d" + "lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java": "87e4d19b5284fa39adf2c24328cae2076b6f7bb3", + "lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py": "165586f801bef4d2f540521e81bc119880038b6c" } \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java index 51b47a0a1f6d..ceec3ce3342a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java @@ -37,23 +37,6 @@ public final class ForDeltaUtil { private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2; private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4; - // IDENTITY_PLUS_ONE[i] == i+1 - private static final int[] IDENTITY_PLUS_ONE = new int[ForUtil.BLOCK_SIZE]; - - static { - for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { - IDENTITY_PLUS_ONE[i] = i + 1; - } - } - - private static void prefixSumOfOnes(int[] arr, int base) { - System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE); - // This loop gets auto-vectorized - for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { - arr[i] += base; - } - } - private static void prefixSum8(int[] arr, int base) { // When the number of bits per value is 4 or less, we can sum up all values in a block without // risking overflowing an 8-bits integer. This allows computing the prefix sum by summing up 4 @@ -199,43 +182,35 @@ private static void innerPrefixSum16(int[] arr) { private final int[] tmp = new int[BLOCK_SIZE]; /** - * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code - * ints} are expected to be deltas between consecutive values. + * Return the number of bits per value required to store the given array containing strictly + * positive numbers. 
*/ - void encodeDeltas(int[] ints, DataOutput out) throws IOException { - if (ints[0] == 1 && PForUtil.allEqual(ints)) { // happens with very dense postings - out.writeByte((byte) 0); - } else { - int or = 0; - for (int l : ints) { - or |= l; - } - assert or != 0; - final int bitsPerValue = PackedInts.bitsRequired(or); - out.writeByte((byte) bitsPerValue); - - final int primitiveSize; - if (bitsPerValue <= 3) { - primitiveSize = 8; - collapse8(ints); - } else if (bitsPerValue <= 10) { - primitiveSize = 16; - collapse16(ints); - } else { - primitiveSize = 32; - } - encode(ints, bitsPerValue, primitiveSize, out, tmp); + int bitsRequired(int[] ints) { + int or = 0; + for (int l : ints) { + or |= l; } + // Deltas should be strictly positive since the delta between consecutive doc IDs is at least 1 + assert or != 0; + return PackedInts.bitsRequired(or); } - /** Decode deltas, compute the prefix sum and add {@code base} to all decoded ints. */ - void decodeAndPrefixSum(PostingDecodingUtil pdu, int base, int[] ints) throws IOException { - final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte()); - if (bitsPerValue == 0) { - prefixSumOfOnes(ints, base); + /** + * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code + * ints} are expected to be deltas between consecutive values. + */ + void encodeDeltas(int bitsPerValue, int[] ints, DataOutput out) throws IOException { + final int primitiveSize; + if (bitsPerValue <= 3) { + primitiveSize = 8; + collapse8(ints); + } else if (bitsPerValue <= 10) { + primitiveSize = 16; + collapse16(ints); } else { - decodeAndPrefixSum(bitsPerValue, pdu, base, ints); + primitiveSize = 32; } + encode(ints, bitsPerValue, primitiveSize, out, tmp); } /** Delta-decode 128 integers into {@code ints}. */ @@ -307,6 +282,9 @@ void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, int base, int prefixSum32(ints, base); break; default: + if (bitsPerValue < 1 || bitsPerValue > Integer.SIZE) { + throw new IllegalStateException("Illegal number of bits per value: " + bitsPerValue); + } decodeSlow(bitsPerValue, pdu, tmp, ints); prefixSum32(ints, base); break; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java index e228f1090ab8..d83111bb8fec 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java @@ -358,8 +358,17 @@ public final class Lucene101PostingsFormat extends PostingsFormat { static final String PAY_CODEC = "Lucene101PostingsWriterPay"; static final int VERSION_START = 0; - static final int VERSION_CURRENT = VERSION_START; + /** + * Version that started encoding dense blocks as bit sets. Note: the old format is a subset of the + * new format, so Lucene101PostingsReader is able to read the old format without checking the + * version. 
+ */ + static final int VERSION_DENSE_BLOCKS_AS_BITSETS = 1; + + static final int VERSION_CURRENT = VERSION_DENSE_BLOCKS_AS_BITSETS; + + private final int version; private final int minTermBlockSize; private final int maxTermBlockSize; @@ -378,7 +387,16 @@ public Lucene101PostingsFormat() { * Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) */ public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize) { + this(minTermBlockSize, maxTermBlockSize, VERSION_CURRENT); + } + + /** Expert constructor that allows setting the version. */ + public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize, int version) { super("Lucene101"); + if (version < VERSION_START || version > VERSION_CURRENT) { + throw new IllegalArgumentException("Version out of range: " + version); + } + this.version = version; Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize); this.minTermBlockSize = minTermBlockSize; this.maxTermBlockSize = maxTermBlockSize; @@ -386,7 +404,7 @@ public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize) { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state, version); boolean success = false; try { FieldsConsumer ret = diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java index 68df7683b28c..b4ccff69fed9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java @@ -295,12 +295,37 @@ private static int sumOverRange(int[] arr, int start, int end) { final class BlockPostingsEnum extends ImpactsEnum { + private enum DeltaEncoding { + /** + * Deltas between consecutive docs are stored as packed integers, ie. the block is encoded + * using Frame Of Reference (FOR). + */ + PACKED, + /** + * Deltas between consecutive docs are stored using unary coding, ie. {@code delta-1} zero + * bits followed by a one bit, ie. the block is encoded as an offset plus a bit set. + */ + UNARY + } + private ForDeltaUtil forDeltaUtil; private PForUtil pforUtil; + /* Variables that store the content of a block and the current position within this block */ + /* Shared variables */ + private DeltaEncoding encoding; + private int doc; // doc we last read + + /* Variables when the block is stored as packed deltas (Frame Of Reference) */ private final int[] docBuffer = new int[BLOCK_SIZE]; - private int doc; // doc we last read + /* Variables when the block is stored as a bit set */ + // Since we use a bit set when it's more storage-efficient, the bit set cannot have more than + // BLOCK_SIZE*32 bits, which is the maximum possible storage requirement with FOR. + private final FixedBitSet docBitSet = new FixedBitSet(BLOCK_SIZE * Integer.SIZE); + private int docBitSetBase; + // Reuse docBuffer for cumulative pop counts of the words of the bit set. 
+ private final int[] docCumulativeWordPopCounts = docBuffer; // level 0 skip data private int level0LastDocID; @@ -572,7 +597,39 @@ public int freq() throws IOException { } private void refillFullBlock() throws IOException { - forDeltaUtil.decodeAndPrefixSum(docInUtil, prevDocID, docBuffer); + int bitsPerValue = docIn.readByte(); + if (bitsPerValue > 0) { + // block is encoded as 128 packed integers that record the delta between doc IDs + forDeltaUtil.decodeAndPrefixSum(bitsPerValue, docInUtil, prevDocID, docBuffer); + encoding = DeltaEncoding.PACKED; + } else { + // block is encoded as a bit set + assert level0LastDocID != NO_MORE_DOCS; + docBitSetBase = prevDocID + 1; + int numLongs; + if (bitsPerValue == 0) { + // 0 is used to record that all 128 docs in the block are consecutive + numLongs = BLOCK_SIZE / Long.SIZE; // 2 + docBitSet.set(0, BLOCK_SIZE); + } else { + numLongs = -bitsPerValue; + docIn.readLongs(docBitSet.getBits(), 0, numLongs); + } + // Note: we know that BLOCK_SIZE bits are set, so no need to compute the cumulative pop + // count at the last index, it will be BLOCK_SIZE. + // Note: this for loop auto-vectorizes + for (int i = 0; i < numLongs - 1; ++i) { + docCumulativeWordPopCounts[i] = Long.bitCount(docBitSet.getBits()[i]); + } + for (int i = 1; i < numLongs - 1; ++i) { + docCumulativeWordPopCounts[i] += docCumulativeWordPopCounts[i - 1]; + } + docCumulativeWordPopCounts[numLongs - 1] = BLOCK_SIZE; + assert docCumulativeWordPopCounts[numLongs - 2] + + Long.bitCount(docBitSet.getBits()[numLongs - 1]) + == BLOCK_SIZE; + encoding = DeltaEncoding.UNARY; + } if (indexHasFreq) { if (needsFreq) { freqFP = docIn.getFilePointer(); @@ -607,6 +664,7 @@ private void refillRemainder() throws IOException { prevDocID = docBuffer[BLOCK_SIZE - 1]; docBufferUpto = 0; posDocBufferUpto = 0; + encoding = DeltaEncoding.PACKED; assert docBuffer[docBufferSize] == NO_MORE_DOCS; } @@ -727,9 +785,10 @@ private void moveToNextLevel0Block() throws IOException { if (needsDocsAndFreqsOnly && docCountLeft >= BLOCK_SIZE) { // Optimize the common path for exhaustive evaluation long level0NumBytes = docIn.readVLong(); - docIn.skipBytes(level0NumBytes); + long level0End = docIn.getFilePointer() + level0NumBytes; + level0LastDocID += readVInt15(docIn); + docIn.seek(level0End); refillFullBlock(); - level0LastDocID = docBuffer[BLOCK_SIZE - 1]; } else { doMoveToNextLevel0Block(); } @@ -857,7 +916,19 @@ public int nextDoc() throws IOException { moveToNextLevel0Block(); } - return this.doc = docBuffer[docBufferUpto++]; + switch (encoding) { + case PACKED: + doc = docBuffer[docBufferUpto]; + break; + case UNARY: + int next = docBitSet.nextSetBit(doc - docBitSetBase + 1); + assert next != NO_MORE_DOCS; + doc = docBitSetBase + next; + break; + } + + ++docBufferUpto; + return this.doc; } @Override @@ -870,9 +941,30 @@ public int advance(int target) throws IOException { needsRefilling = false; } - int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); - this.doc = docBuffer[next]; - docBufferUpto = next + 1; + switch (encoding) { + case PACKED: + { + int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); + this.doc = docBuffer[next]; + docBufferUpto = next + 1; + } + break; + case UNARY: + { + int next = docBitSet.nextSetBit(target - docBitSetBase); + assert next != NO_MORE_DOCS; + this.doc = docBitSetBase + next; + int wordIndex = next >> 6; + // Take the cumulative pop count for the given word, and subtract bits on the left of + // the current doc. 
+ docBufferUpto = + 1 + + docCumulativeWordPopCounts[wordIndex] + - Long.bitCount(docBitSet.getBits()[wordIndex] >>> next); + } + break; + } + return doc; } @@ -891,19 +983,53 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOExcept moveToNextLevel0Block(); } - int start = docBufferUpto; - int end = computeBufferEndBoundary(upTo); - if (end != 0) { - bufferIntoBitSet(start, end, bitSet, offset); - doc = docBuffer[end - 1]; - } - docBufferUpto = end; + switch (encoding) { + case PACKED: + { + int start = docBufferUpto; + int end = computeBufferEndBoundary(upTo); + if (end != 0) { + bufferIntoBitSet(start, end, bitSet, offset); + doc = docBuffer[end - 1]; + } + docBufferUpto = end; + if (end != BLOCK_SIZE) { + // Either the block is a tail block, or the block did not fully match, we're done. + nextDoc(); + assert doc >= upTo; + return; + } + } + break; + case UNARY: + { + int sourceFrom; + if (docBufferUpto == 0) { + // start from beginning + sourceFrom = 0; + } else { + // start after the current doc + sourceFrom = doc - docBitSetBase + 1; + } - if (end != BLOCK_SIZE) { - // Either the block is a tail block, or the block did not fully match, we're done. - nextDoc(); - assert doc >= upTo; - break; + int destFrom = docBitSetBase - offset + sourceFrom; + + assert level0LastDocID != NO_MORE_DOCS; + int sourceTo = Math.min(upTo, level0LastDocID + 1) - docBitSetBase; + + if (sourceTo > sourceFrom) { + FixedBitSet.orRange(docBitSet, sourceFrom, bitSet, destFrom, sourceTo - sourceFrom); + } + if (docBitSetBase + sourceTo <= level0LastDocID) { + // We stopped before the end of the current bit set, which means that we're done. + // Set the current doc before returning. + advance(docBitSetBase + sourceTo); + return; + } + doc = level0LastDocID; + docBufferUpto = BLOCK_SIZE; + } + break; } } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java index 788a5515f2d1..1cabefe681ef 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java @@ -16,16 +16,16 @@ */ package org.apache.lucene.codecs.lucene101; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.*; import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.DOC_CODEC; import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.LEVEL1_MASK; import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.META_CODEC; import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.PAY_CODEC; import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.POS_CODEC; import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.TERMS_CODEC; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.VERSION_CURRENT; import java.io.IOException; +import java.util.Arrays; import java.util.Collection; import java.util.List; import org.apache.lucene.codecs.BlockTermState; @@ -46,6 +46,7 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; /** Writer for {@link Lucene101PostingsFormat}. 
*/ @@ -53,6 +54,8 @@ public class Lucene101PostingsWriter extends PushPostingsWriterBase { static final IntBlockTermState EMPTY_STATE = new IntBlockTermState(); + private final int version; + IndexOutput metaOut; IndexOutput docOut; IndexOutput posOut; @@ -124,8 +127,22 @@ public class Lucene101PostingsWriter extends PushPostingsWriterBase { */ private final ByteBuffersDataOutput level1Output = ByteBuffersDataOutput.newResettableInstance(); - /** Sole constructor. */ + /** + * Reusable FixedBitSet, for dense blocks that are more efficiently stored by storing them as a + * bit set than as packed deltas. + */ + // Since we use a bit set when it's more storage-efficient, the bit set cannot have more than + // BLOCK_SIZE*32 bits, which is the maximum possible storage requirement with FOR. + private final FixedBitSet spareBitSet = new FixedBitSet(BLOCK_SIZE * Integer.SIZE); + + /** Sole public constructor. */ public Lucene101PostingsWriter(SegmentWriteState state) throws IOException { + this(state, Lucene101PostingsFormat.VERSION_CURRENT); + } + + /** Constructor that takes a version. */ + Lucene101PostingsWriter(SegmentWriteState state, int version) throws IOException { + this.version = version; String metaFileName = IndexFileNames.segmentFileName( state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.META_EXTENSION); @@ -139,9 +156,9 @@ public Lucene101PostingsWriter(SegmentWriteState state) throws IOException { try { docOut = state.directory.createOutput(docFileName, state.context); CodecUtil.writeIndexHeader( - metaOut, META_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + metaOut, META_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); CodecUtil.writeIndexHeader( - docOut, DOC_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + docOut, DOC_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); forDeltaUtil = new ForDeltaUtil(); pforUtil = new PForUtil(); if (state.fieldInfos.hasProx()) { @@ -151,7 +168,7 @@ public Lucene101PostingsWriter(SegmentWriteState state) throws IOException { state.segmentInfo.name, state.segmentSuffix, Lucene101PostingsFormat.POS_EXTENSION); posOut = state.directory.createOutput(posFileName, state.context); CodecUtil.writeIndexHeader( - posOut, POS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + posOut, POS_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); if (state.fieldInfos.hasPayloads()) { payloadBytes = new byte[128]; @@ -177,7 +194,7 @@ public Lucene101PostingsWriter(SegmentWriteState state) throws IOException { Lucene101PostingsFormat.PAY_EXTENSION); payOut = state.directory.createOutput(payFileName, state.context); CodecUtil.writeIndexHeader( - payOut, PAY_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + payOut, PAY_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); } } else { posDeltaBuffer = null; @@ -207,7 +224,7 @@ public IntBlockTermState newTermState() { @Override public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException { CodecUtil.writeIndexHeader( - termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + termsOut, TERMS_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); termsOut.writeVInt(BLOCK_SIZE); } @@ -405,7 +422,39 @@ private void flushDocBlock(boolean finishTerm) throws IOException { } } long numSkipBytes = level0Output.size(); - forDeltaUtil.encodeDeltas(docDeltaBuffer, level0Output); + // Now we need 
to decide whether to encode block deltas as packed integers (FOR) or unary + // codes (bit set). FOR makes #nextDoc() a bit faster while the bit set approach makes + // #advance() sometimes faster and #intoBitSet() much faster. Since the trade-off is not + // obvious, we make the decision purely based on storage efficiency, using the approach that + // requires fewer bits to encode the block. + int bitsPerValue = forDeltaUtil.bitsRequired(docDeltaBuffer); + int sum = Math.toIntExact(Arrays.stream(docDeltaBuffer).sum()); + int numBitSetLongs = FixedBitSet.bits2words(sum); + if (sum == BLOCK_SIZE) { + level0Output.writeByte((byte) 0); + } else if (version < VERSION_DENSE_BLOCKS_AS_BITSETS || bitsPerValue * BLOCK_SIZE < sum) { + level0Output.writeByte((byte) bitsPerValue); + forDeltaUtil.encodeDeltas(bitsPerValue, docDeltaBuffer, level0Output); + } else { + // Storing doc deltas is more efficient using unary coding (ie. storing doc IDs as a bit + // set) + spareBitSet.clear(0, numBitSetLongs << 6); + int s = -1; + for (int i : docDeltaBuffer) { + s += i; + spareBitSet.set(s); + } + // Since we use the bit set encoding when it's more storage efficient than storing deltas, + // we know that each doc ID uses less than 32 bits, the maximum number of bits required to + // store a delta between consecutive doc IDs. So in the end, the bit set cannot have more + // than BLOCK_SIZE * Integer.SIZE / Long.SIZE = 64 longs, which fits on a byte. + assert numBitSetLongs <= BLOCK_SIZE / 2; + level0Output.writeByte((byte) -numBitSetLongs); + for (int i = 0; i < numBitSetLongs; ++i) { + level0Output.writeLong(spareBitSet.getBits()[i]); + } + } + if (writeFreqs) { pforUtil.encode(freqBuffer, level0Output); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py index 3214aa671002..b1b36db096a7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py @@ -63,23 +63,6 @@ private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2; private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4; - // IDENTITY_PLUS_ONE[i] == i+1 - private static final int[] IDENTITY_PLUS_ONE = new int[ForUtil.BLOCK_SIZE]; - - static { - for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { - IDENTITY_PLUS_ONE[i] = i + 1; - } - } - - private static void prefixSumOfOnes(int[] arr, int base) { - System.arraycopy(IDENTITY_PLUS_ONE, 0, arr, 0, ForUtil.BLOCK_SIZE); - // This loop gets auto-vectorized - for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { - arr[i] += base; - } - } - private static void prefixSum8(int[] arr, int base) { // When the number of bits per value is 4 or less, we can sum up all values in a block without // risking overflowing an 8-bits integer. This allows computing the prefix sum by summing up 4 @@ -224,44 +207,33 @@ private final int[] tmp = new int[BLOCK_SIZE]; + /** Return the number of bits per value required to store the given array containing strictly positive numbers. */ + int bitsRequired(int[] ints) { + int or = 0; + for (int l : ints) { + or |= l; + } + // Deltas should be strictly positive since the delta between consecutive doc IDs is at least 1 + assert or != 0; + return PackedInts.bitsRequired(or); + } + /** * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code * ints} are expected to be deltas between consecutive values. 
*/ - void encodeDeltas(int[] ints, DataOutput out) throws IOException { - if (ints[0] == 1 && PForUtil.allEqual(ints)) { // happens with very dense postings - out.writeByte((byte) 0); - } else { - int or = 0; - for (int l : ints) { - or |= l; - } - assert or != 0; - final int bitsPerValue = PackedInts.bitsRequired(or); - out.writeByte((byte) bitsPerValue); - - final int primitiveSize; - if (bitsPerValue <= 3) { - primitiveSize = 8; - collapse8(ints); - } else if (bitsPerValue <= 10) { - primitiveSize = 16; - collapse16(ints); - } else { - primitiveSize = 32; - } - encode(ints, bitsPerValue, primitiveSize, out, tmp); - } - } - - /** Decode deltas, compute the prefix sum and add {@code base} to all decoded ints. */ - void decodeAndPrefixSum(PostingDecodingUtil pdu, int base, int[] ints) throws IOException { - final int bitsPerValue = Byte.toUnsignedInt(pdu.in.readByte()); - if (bitsPerValue == 0) { - prefixSumOfOnes(ints, base); + void encodeDeltas(int bitsPerValue, int[] ints, DataOutput out) throws IOException { + final int primitiveSize; + if (bitsPerValue <= 3) { + primitiveSize = 8; + collapse8(ints); + } else if (bitsPerValue <= 10) { + primitiveSize = 16; + collapse16(ints); } else { - decodeAndPrefixSum(bitsPerValue, pdu, base, ints); + primitiveSize = 32; } + encode(ints, bitsPerValue, primitiveSize, out, tmp); } """ @@ -361,6 +333,9 @@ def writeDecode(bpv, f): f.write(' prefixSum%d(ints, base);\n' %primitive_size) f.write(' break;\n') f.write(' default:\n') + f.write(' if (bitsPerValue < 1 || bitsPerValue > Integer.SIZE) {\n') + f.write(' throw new IllegalStateException("Illegal number of bits per value: " + bitsPerValue);\n') + f.write(' }\n') f.write(' decodeSlow(bitsPerValue, pdu, tmp, ints);\n') f.write(' prefixSum32(ints, base);\n') f.write(' break;\n') diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java index 3e346f3eb206..d41ab472ea60 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java @@ -56,7 +56,9 @@ public void testEncodeDecode() throws IOException { for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { source[j] = values[i * ForUtil.BLOCK_SIZE + j]; } - forDeltaUtil.encodeDeltas(source, out); + int bitsPerValue = forDeltaUtil.bitsRequired(source); + out.writeByte((byte) bitsPerValue); + forDeltaUtil.encodeDeltas(bitsPerValue, source, out); } endPointer = out.getFilePointer(); out.close(); @@ -71,7 +73,8 @@ public void testEncodeDecode() throws IOException { for (int i = 0; i < iterations; ++i) { int base = 0; final int[] restored = new int[ForUtil.BLOCK_SIZE]; - forDeltaUtil.decodeAndPrefixSum(pdu, base, restored); + int bitsPerValue = pdu.in.readByte(); + forDeltaUtil.decodeAndPrefixSum(bitsPerValue, pdu, base, restored); final int[] expected = new int[ForUtil.BLOCK_SIZE]; for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { expected[j] = values[i * ForUtil.BLOCK_SIZE + j]; diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormatV0.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormatV0.java new file mode 100644 index 000000000000..037527413ea8 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormatV0.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene101; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; +import org.apache.lucene.tests.index.BasePostingsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; + +public class TestLucene101PostingsFormatV0 extends BasePostingsFormatTestCase { + + @Override + protected Codec getCodec() { + return TestUtil.alwaysPostingsFormat( + new Lucene101PostingsFormat( + Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, + Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE, + Lucene101PostingsFormat.VERSION_START)); + } +} From 245acc8db4934923efff0a7d49271b8c0501b92d Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 14 Jan 2025 19:03:56 +0100 Subject: [PATCH 21/88] Add CHANGES for #14133. --- lucene/CHANGES.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 31ddd83d907e..c0a73e1bbedb 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -65,6 +65,9 @@ Optimizations * GITHUB#14080: Use the `DocIdSetIterator#loadIntoBitSet` API to speed up dense conjunctions. (Adrien Grand) +* GITHUB#14133: Dense blocks of postings are now encoded as bit sets. + (Adrien Grand) + Bug Fixes --------------------- From c1cbb2254554ca3f64a09555aea4f06422e14d51 Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Wed, 15 Jan 2025 08:39:05 +0100 Subject: [PATCH 22/88] Remove SingleValueDocValuesFieldUpdates abstract class (#14059) This abstract class has currently one implementation so this removes this indirection. 
--- .../lucene/index/DocValuesFieldUpdates.java | 106 ------------------ .../index/NumericDocValuesFieldUpdates.java | 92 ++++++++++++++- 2 files changed, 88 insertions(+), 110 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesFieldUpdates.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesFieldUpdates.java index 96da8625c444..ffcb9f07c9b1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValuesFieldUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesFieldUpdates.java @@ -20,13 +20,10 @@ import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.Accountable; -import org.apache.lucene.util.BitSet; -import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntroSorter; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.SparseFixedBitSet; import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PagedMutable; @@ -480,107 +477,4 @@ final boolean hasValue() { return hasValue; } } - - abstract static class SingleValueDocValuesFieldUpdates extends DocValuesFieldUpdates { - private final BitSet bitSet; - private BitSet hasNoValue; - private boolean hasAtLeastOneValue; - - protected SingleValueDocValuesFieldUpdates( - int maxDoc, long delGen, String field, DocValuesType type) { - super(maxDoc, delGen, field, type); - this.bitSet = new SparseFixedBitSet(maxDoc); - } - - @Override - void add(int doc, long value) { - assert longValue() == value; - bitSet.set(doc); - this.hasAtLeastOneValue = true; - if (hasNoValue != null) { - hasNoValue.clear(doc); - } - } - - @Override - void add(int doc, BytesRef value) { - assert binaryValue().equals(value); - bitSet.set(doc); - this.hasAtLeastOneValue = true; - if (hasNoValue != null) { - hasNoValue.clear(doc); - } - } - - @Override - synchronized void reset(int doc) { - bitSet.set(doc); - this.hasAtLeastOneValue = true; - if (hasNoValue == null) { - hasNoValue = new SparseFixedBitSet(maxDoc); - } - hasNoValue.set(doc); - } - - @Override - void add(int docId, Iterator iterator) { - throw new UnsupportedOperationException(); - } - - protected abstract BytesRef binaryValue(); - - protected abstract long longValue(); - - @Override - synchronized boolean any() { - return super.any() || hasAtLeastOneValue; - } - - @Override - public long ramBytesUsed() { - return super.ramBytesUsed() - + bitSet.ramBytesUsed() - + (hasNoValue == null ? 
0 : hasNoValue.ramBytesUsed()); - } - - @Override - Iterator iterator() { - BitSetIterator iterator = new BitSetIterator(bitSet, maxDoc); - return new DocValuesFieldUpdates.Iterator() { - - @Override - public int docID() { - return iterator.docID(); - } - - @Override - public int nextDoc() { - return iterator.nextDoc(); - } - - @Override - long longValue() { - return SingleValueDocValuesFieldUpdates.this.longValue(); - } - - @Override - BytesRef binaryValue() { - return SingleValueDocValuesFieldUpdates.this.binaryValue(); - } - - @Override - long delGen() { - return delGen; - } - - @Override - boolean hasValue() { - if (hasNoValue != null) { - return hasNoValue.get(docID()) == false; - } - return true; - } - }; - } - } } diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesFieldUpdates.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesFieldUpdates.java index d58a12c88253..a3c14486fbda 100644 --- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesFieldUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesFieldUpdates.java @@ -17,8 +17,11 @@ package org.apache.lucene.index; import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.util.BitSet; +import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.SparseFixedBitSet; import org.apache.lucene.util.packed.AbstractPagedMutable; import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PagedGrowableWriter; @@ -130,23 +133,104 @@ public long ramBytesUsed() { + RamUsageEstimator.NUM_BYTES_OBJECT_REF; } - static class SingleValueNumericDocValuesFieldUpdates extends SingleValueDocValuesFieldUpdates { + static class SingleValueNumericDocValuesFieldUpdates extends DocValuesFieldUpdates { private final long value; + private final BitSet bitSet; + private BitSet hasNoValue; + private boolean hasAtLeastOneValue; SingleValueNumericDocValuesFieldUpdates(long delGen, String field, int maxDoc, long value) { super(maxDoc, delGen, field, DocValuesType.NUMERIC); + this.bitSet = new SparseFixedBitSet(maxDoc); this.value = value; } + // pkg private for testing + long longValue() { + return value; + } + @Override - protected BytesRef binaryValue() { + void add(int doc, long value) { + assert this.value == value; + bitSet.set(doc); + this.hasAtLeastOneValue = true; + if (hasNoValue != null) { + hasNoValue.clear(doc); + } + } + + @Override + void add(int doc, BytesRef value) { throw new UnsupportedOperationException(); } @Override - protected long longValue() { - return value; + synchronized void reset(int doc) { + bitSet.set(doc); + this.hasAtLeastOneValue = true; + if (hasNoValue == null) { + hasNoValue = new SparseFixedBitSet(maxDoc); + } + hasNoValue.set(doc); + } + + @Override + void add(int docId, Iterator iterator) { + throw new UnsupportedOperationException(); + } + + @Override + synchronized boolean any() { + return super.any() || hasAtLeastOneValue; + } + + @Override + public long ramBytesUsed() { + return super.ramBytesUsed() + + bitSet.ramBytesUsed() + + (hasNoValue == null ? 
0 : hasNoValue.ramBytesUsed()); + } + + @Override + Iterator iterator() { + BitSetIterator iterator = new BitSetIterator(bitSet, maxDoc); + return new DocValuesFieldUpdates.Iterator() { + + @Override + public int docID() { + return iterator.docID(); + } + + @Override + public int nextDoc() { + return iterator.nextDoc(); + } + + @Override + long longValue() { + return value; + } + + @Override + BytesRef binaryValue() { + throw new UnsupportedOperationException(); + } + + @Override + long delGen() { + return delGen; + } + + @Override + boolean hasValue() { + if (hasNoValue != null) { + return hasNoValue.get(docID()) == false; + } + return true; + } + }; } } } From b87757c2d2f7aa9847268b058c59c2d2ef11a9cf Mon Sep 17 00:00:00 2001 From: Craig Perkins Date: Wed, 15 Jan 2025 07:21:32 -0500 Subject: [PATCH 23/88] Complete the javadoc for DirectoryReader#indexExists (#14136) Thank you! --- .../core/src/java/org/apache/lucene/index/DirectoryReader.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java index d50ae3a85cba..7571a7f0d074 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java @@ -305,7 +305,8 @@ public static List listCommits(Directory dir) throws IOException { /** * Returns true if an index likely exists at the specified directory. Note that if a - * corrupt index exists, or if an index in the process of committing + * corrupt index exists, or if an index in the process of committing the return value is not + * reliable. * * @param directory the directory to check for an index * @return true if an index exists; false otherwise From 34a732f1935e75f0b41c0262a7fcf3a8e4d16565 Mon Sep 17 00:00:00 2001 From: Clay Johnson Date: Wed, 15 Jan 2025 06:38:37 -0600 Subject: [PATCH 24/88] Publish build scans to develocity.apache.org (#14140) * Publish build scans to develocity.apache.org * Update Develocity plugin versions * Use `DEVELOCITY_ACCESS_KEY` to authenticate to `develocity.apache.org` --- .github/workflows/run-checks-all.yml | 2 +- .github/workflows/run-checks-gradle-upgrade.yml | 2 +- .github/workflows/run-checks-mod-analysis-common.yml | 2 +- .../workflows/run-checks-mod-distribution.tests.yml | 2 +- README.md | 2 +- gradle/ge.gradle | 12 ++++++------ settings.gradle | 4 ++-- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/run-checks-all.yml b/.github/workflows/run-checks-all.yml index 18dd308e9a77..7242a57dbeb6 100644 --- a/.github/workflows/run-checks-all.yml +++ b/.github/workflows/run-checks-all.yml @@ -13,7 +13,7 @@ on: - 'branch_10x' env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ DEVELOCITY_ACCESS_KEY }} # We split the workflow into two parallel jobs for efficiency: # one is running all validation checks without tests, diff --git a/.github/workflows/run-checks-gradle-upgrade.yml b/.github/workflows/run-checks-gradle-upgrade.yml index 07b7210cf4e2..0eefce64a23b 100644 --- a/.github/workflows/run-checks-gradle-upgrade.yml +++ b/.github/workflows/run-checks-gradle-upgrade.yml @@ -20,7 +20,7 @@ on: - 'gradle/wrapper/**' env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ DEVELOCITY_ACCESS_KEY }} jobs: gradleSanityCheck: diff --git a/.github/workflows/run-checks-mod-analysis-common.yml 
b/.github/workflows/run-checks-mod-analysis-common.yml index a208039a99fa..8b9cde59ef46 100644 --- a/.github/workflows/run-checks-mod-analysis-common.yml +++ b/.github/workflows/run-checks-mod-analysis-common.yml @@ -20,7 +20,7 @@ on: - 'lucene/analysis/common/**' env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ DEVELOCITY_ACCESS_KEY }} jobs: test: diff --git a/.github/workflows/run-checks-mod-distribution.tests.yml b/.github/workflows/run-checks-mod-distribution.tests.yml index e3af5812c80c..646c048bdd03 100644 --- a/.github/workflows/run-checks-mod-distribution.tests.yml +++ b/.github/workflows/run-checks-mod-distribution.tests.yml @@ -14,7 +14,7 @@ on: - 'branch_10x' env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ DEVELOCITY_ACCESS_KEY }} jobs: test: diff --git a/README.md b/README.md index c613a16986ea..c2c963ef50e3 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Apache Lucene is a high-performance, full-featured text search engine library written in Java. [![Build Status](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/badge/icon?subject=Lucene)](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/) -[![Revved up by Develocity](https://img.shields.io/badge/Revved%20up%20by-Develocity-06A0CE?logo=Gradle&labelColor=02303A)](https://ge.apache.org/scans?search.buildToolType=gradle&search.rootProjectNames=lucene-root) +[![Revved up by Develocity](https://img.shields.io/badge/Revved%20up%20by-Develocity-06A0CE?logo=Gradle&labelColor=02303A)](https://develocity.apache.org/scans?search.buildToolType=gradle&search.rootProjectNames=lucene-root) ## Online Documentation diff --git a/gradle/ge.gradle b/gradle/ge.gradle index f6bba24f23f5..c4677859e33a 100644 --- a/gradle/ge.gradle +++ b/gradle/ge.gradle @@ -17,13 +17,13 @@ def isCIBuild = System.getenv().keySet().find { it ==~ /(?i)((JENKINS|HUDSON)(_\w+)?|CI)/ } != null -gradleEnterprise { - server = "https://ge.apache.org" +develocity { + server = "https://develocity.apache.org" + projectId = "lucene" + buildScan { - capture { taskInputFiles = true } uploadInBackground = !isCIBuild - publishAlways() - publishIfAuthenticated() + publishing.onlyIf { it.isAuthenticated() } obfuscation { ipAddresses { addresses -> addresses.collect { address -> "0.0.0.0"} } } @@ -35,7 +35,7 @@ buildCache { enabled = !isCIBuild } - remote(gradleEnterprise.buildCache) { + remote(develocity.buildCache) { enabled = false } } diff --git a/settings.gradle b/settings.gradle index f4ee13243ca6..8543bab1619f 100644 --- a/settings.gradle +++ b/settings.gradle @@ -26,8 +26,8 @@ pluginManagement { plugins { id "org.gradle.toolchains.foojay-resolver-convention" version "0.8.0" - id 'com.gradle.enterprise' version '3.15.1' - id 'com.gradle.common-custom-user-data-gradle-plugin' version '1.11.3' + id 'com.gradle.develocity' version '3.18.2' + id 'com.gradle.common-custom-user-data-gradle-plugin' version '2.0.2' } dependencyResolutionManagement { From 905efa9e2af8ba4b808effe6b635663c9e601201 Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Wed, 15 Jan 2025 13:56:20 +0100 Subject: [PATCH 25/88] Revert "Publish build scans to develocity.apache.org (#14140)" This reverts commit 34a732f1935e75f0b41c0262a7fcf3a8e4d16565. 
--- .github/workflows/run-checks-all.yml | 2 +- .github/workflows/run-checks-gradle-upgrade.yml | 2 +- .github/workflows/run-checks-mod-analysis-common.yml | 2 +- .../workflows/run-checks-mod-distribution.tests.yml | 2 +- README.md | 2 +- gradle/ge.gradle | 12 ++++++------ settings.gradle | 4 ++-- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/run-checks-all.yml b/.github/workflows/run-checks-all.yml index 7242a57dbeb6..18dd308e9a77 100644 --- a/.github/workflows/run-checks-all.yml +++ b/.github/workflows/run-checks-all.yml @@ -13,7 +13,7 @@ on: - 'branch_10x' env: - DEVELOCITY_ACCESS_KEY: ${{ DEVELOCITY_ACCESS_KEY }} + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} # We split the workflow into two parallel jobs for efficiency: # one is running all validation checks without tests, diff --git a/.github/workflows/run-checks-gradle-upgrade.yml b/.github/workflows/run-checks-gradle-upgrade.yml index 0eefce64a23b..07b7210cf4e2 100644 --- a/.github/workflows/run-checks-gradle-upgrade.yml +++ b/.github/workflows/run-checks-gradle-upgrade.yml @@ -20,7 +20,7 @@ on: - 'gradle/wrapper/**' env: - DEVELOCITY_ACCESS_KEY: ${{ DEVELOCITY_ACCESS_KEY }} + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} jobs: gradleSanityCheck: diff --git a/.github/workflows/run-checks-mod-analysis-common.yml b/.github/workflows/run-checks-mod-analysis-common.yml index 8b9cde59ef46..a208039a99fa 100644 --- a/.github/workflows/run-checks-mod-analysis-common.yml +++ b/.github/workflows/run-checks-mod-analysis-common.yml @@ -20,7 +20,7 @@ on: - 'lucene/analysis/common/**' env: - DEVELOCITY_ACCESS_KEY: ${{ DEVELOCITY_ACCESS_KEY }} + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} jobs: test: diff --git a/.github/workflows/run-checks-mod-distribution.tests.yml b/.github/workflows/run-checks-mod-distribution.tests.yml index 646c048bdd03..e3af5812c80c 100644 --- a/.github/workflows/run-checks-mod-distribution.tests.yml +++ b/.github/workflows/run-checks-mod-distribution.tests.yml @@ -14,7 +14,7 @@ on: - 'branch_10x' env: - DEVELOCITY_ACCESS_KEY: ${{ DEVELOCITY_ACCESS_KEY }} + GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} jobs: test: diff --git a/README.md b/README.md index c2c963ef50e3..c613a16986ea 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Apache Lucene is a high-performance, full-featured text search engine library written in Java. 
[![Build Status](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/badge/icon?subject=Lucene)](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/) -[![Revved up by Develocity](https://img.shields.io/badge/Revved%20up%20by-Develocity-06A0CE?logo=Gradle&labelColor=02303A)](https://develocity.apache.org/scans?search.buildToolType=gradle&search.rootProjectNames=lucene-root) +[![Revved up by Develocity](https://img.shields.io/badge/Revved%20up%20by-Develocity-06A0CE?logo=Gradle&labelColor=02303A)](https://ge.apache.org/scans?search.buildToolType=gradle&search.rootProjectNames=lucene-root) ## Online Documentation diff --git a/gradle/ge.gradle b/gradle/ge.gradle index c4677859e33a..f6bba24f23f5 100644 --- a/gradle/ge.gradle +++ b/gradle/ge.gradle @@ -17,13 +17,13 @@ def isCIBuild = System.getenv().keySet().find { it ==~ /(?i)((JENKINS|HUDSON)(_\w+)?|CI)/ } != null -develocity { - server = "https://develocity.apache.org" - projectId = "lucene" - +gradleEnterprise { + server = "https://ge.apache.org" buildScan { + capture { taskInputFiles = true } uploadInBackground = !isCIBuild - publishing.onlyIf { it.isAuthenticated() } + publishAlways() + publishIfAuthenticated() obfuscation { ipAddresses { addresses -> addresses.collect { address -> "0.0.0.0"} } } @@ -35,7 +35,7 @@ buildCache { enabled = !isCIBuild } - remote(develocity.buildCache) { + remote(gradleEnterprise.buildCache) { enabled = false } } diff --git a/settings.gradle b/settings.gradle index 8543bab1619f..f4ee13243ca6 100644 --- a/settings.gradle +++ b/settings.gradle @@ -26,8 +26,8 @@ pluginManagement { plugins { id "org.gradle.toolchains.foojay-resolver-convention" version "0.8.0" - id 'com.gradle.develocity' version '3.18.2' - id 'com.gradle.common-custom-user-data-gradle-plugin' version '2.0.2' + id 'com.gradle.enterprise' version '3.15.1' + id 'com.gradle.common-custom-user-data-gradle-plugin' version '1.11.3' } dependencyResolutionManagement { From 34f0453283a45acac366539d08d47a5e8939204a Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Wed, 15 Jan 2025 09:08:29 -0500 Subject: [PATCH 26/88] Add two new "Seeded" Knn queries for seeded vector search (#14084) ### Description In some vector search cases, users may already know some documents that are likely related to a query. Let's support seeding HNSW's scoring stage with these documents, rather than using HNSW's hierarchical stage. An example use case is hybrid search, where both a traditional and vector search are performed. The top results from the traditional search are likely reasonable seeds for the vector search. Even when not performing hybrid search, traditional matching can often be faster than traversing the hierarchy, which can be used to speed up the vector search process (up to 2x faster for the same effectiveness), as was demonstrated in [this article](https://arxiv.org/abs/2307.16779) (full disclosure: seanmacavaney is an author of the article). The main changes are: - A new "seeded" focused knn collector and collector manager - Two new basic knn queries that expose using these specialized collectors for seeded entrypoint - `HnswGraphSearcher`, which bypasses the `findBestEntryPoint` step if seeds are provided. 
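For reference, a minimal usage sketch of the new query (not part of this change; the `directory` handle, field names, placeholder vector, and lexical seed query below are illustrative assumptions):

```java
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SeededKnnFloatVectorQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;

try (DirectoryReader reader = DirectoryReader.open(directory)) { // assumes an existing Directory
  IndexSearcher searcher = new IndexSearcher(reader);
  float[] queryVector = new float[] {0.1f, 0.2f, 0.3f}; // placeholder dense query vector
  // Use the best lexical matches as HNSW entry points instead of the hierarchy traversal.
  Query seed = new TermQuery(new Term("body", "lucene"));
  Query knn = new SeededKnnFloatVectorQuery("vector", queryVector, 10, /* filter */ null, seed);
  TopDocs hits = searcher.search(knn, 10);
}
```

If the seed query matches no documents, the query falls back to the regular HNSW entry-point discovery, as covered by the new tests.
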
//cc @seanmacavaney Co-authored-by: Sean MacAvaney Co-authored-by: Sean MacAvaney Co-authored-by: Christine Poerschke --- lucene/CHANGES.txt | 6 +- .../lucene/search/KnnByteVectorQuery.java | 2 +- .../apache/lucene/search/KnnCollector.java | 54 +++++ .../lucene/search/KnnFloatVectorQuery.java | 2 +- .../search/SeededKnnByteVectorQuery.java | 97 +++++++++ .../search/SeededKnnFloatVectorQuery.java | 97 +++++++++ .../TimeLimitingKnnCollectorManager.java | 42 +--- .../lucene/search/knn/EntryPointProvider.java | 28 +++ .../lucene/search/knn/SeededKnnCollector.java | 48 ++++ .../search/knn/SeededKnnCollectorManager.java | 177 +++++++++++++++ .../lucene/util/hnsw/HnswGraphSearcher.java | 31 ++- .../hnsw/OrdinalTranslatedKnnCollector.java | 42 +--- .../lucene/document/TestManyKnnDocs.java | 136 +++++++++++- .../lucene/search/TestKnnByteVectorQuery.java | 4 +- .../search/TestKnnFloatVectorQuery.java | 2 +- .../search/TestSeededKnnByteVectorQuery.java | 205 ++++++++++++++++++ .../search/TestSeededKnnFloatVectorQuery.java | 191 ++++++++++++++++ 17 files changed, 1075 insertions(+), 89 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/search/SeededKnnByteVectorQuery.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/SeededKnnFloatVectorQuery.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/knn/EntryPointProvider.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollector.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollectorManager.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestSeededKnnByteVectorQuery.java create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestSeededKnnFloatVectorQuery.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c0a73e1bbedb..5084c25f3560 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -46,7 +46,11 @@ API Changes New Features --------------------- -(No changes) + +* GITHUB#14084, GITHUB#13635, GITHUB#13634: Adds new `SeededKnnByteVectorQuery` and `SeededKnnFloatVectorQuery` + queries. These queries allow for the vector search entry points to be initialized via a `seed` query. This follows + the research provided via https://arxiv.org/abs/2307.16779. (Sean MacAvaney, Ben Trent). 
+ Improvements --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/search/KnnByteVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/KnnByteVectorQuery.java index 35144055830c..05157ab65cb5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/KnnByteVectorQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/KnnByteVectorQuery.java @@ -46,7 +46,7 @@ public class KnnByteVectorQuery extends AbstractKnnVectorQuery { private static final TopDocs NO_RESULTS = TopDocsCollector.EMPTY_TOPDOCS; - private final byte[] target; + protected final byte[] target; /** * Find the k nearest documents to the target vector according to the vectors in the diff --git a/lucene/core/src/java/org/apache/lucene/search/KnnCollector.java b/lucene/core/src/java/org/apache/lucene/search/KnnCollector.java index 43bac9fbc309..f694d8f7085c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/KnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/KnnCollector.java @@ -85,4 +85,58 @@ public interface KnnCollector { * @return The collected top documents */ TopDocs topDocs(); + + /** + * KnnCollector.Decorator is the base class for decorators of KnnCollector objects, which extend + * the object with new behaviors. + * + * @lucene.experimental + */ + abstract class Decorator implements KnnCollector { + private final KnnCollector collector; + + public Decorator(KnnCollector collector) { + this.collector = collector; + } + + @Override + public boolean earlyTerminated() { + return collector.earlyTerminated(); + } + + @Override + public void incVisitedCount(int count) { + collector.incVisitedCount(count); + } + + @Override + public long visitedCount() { + return collector.visitedCount(); + } + + @Override + public long visitLimit() { + return collector.visitLimit(); + } + + @Override + public int k() { + return collector.k(); + } + + @Override + public boolean collect(int docId, float similarity) { + return collector.collect(docId, similarity); + } + + @Override + public float minCompetitiveSimilarity() { + return collector.minCompetitiveSimilarity(); + } + + @Override + public TopDocs topDocs() { + return collector.topDocs(); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/KnnFloatVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/KnnFloatVectorQuery.java index d2aaf4296eda..c7d6fdb3608d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/KnnFloatVectorQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/KnnFloatVectorQuery.java @@ -47,7 +47,7 @@ public class KnnFloatVectorQuery extends AbstractKnnVectorQuery { private static final TopDocs NO_RESULTS = TopDocsCollector.EMPTY_TOPDOCS; - private final float[] target; + protected final float[] target; /** * Find the k nearest documents to the target vector according to the vectors in the diff --git a/lucene/core/src/java/org/apache/lucene/search/SeededKnnByteVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/SeededKnnByteVectorQuery.java new file mode 100644 index 000000000000..980b6869c34f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/SeededKnnByteVectorQuery.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.Objects; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.search.knn.KnnCollectorManager; +import org.apache.lucene.search.knn.SeededKnnCollectorManager; + +/** + * This is a version of knn byte vector query that provides a query seed to initiate the vector + * search. NOTE: The underlying format is free to ignore the provided seed + * + *
    See "Lexically-Accelerated Dense + * Retrieval" (Kulkarni, Hrishikesh and MacAvaney, Sean and Goharian, Nazli and Frieder, Ophir). + * In SIGIR '23: Proceedings of the 46th International ACM SIGIR Conference on Research and + * Development in Information Retrieval Pages 152 - 162 + * + * @lucene.experimental + */ +public class SeededKnnByteVectorQuery extends KnnByteVectorQuery { + final Query seed; + final Weight seedWeight; + + /** + * Construct a new SeededKnnByteVectorQuery instance + * + * @param field knn byte vector field to query + * @param target the query vector + * @param k number of neighbors to return + * @param filter a filter on the neighbors to return + * @param seed a query seed to initiate the vector format search + */ + public SeededKnnByteVectorQuery(String field, byte[] target, int k, Query filter, Query seed) { + super(field, target, k, filter); + this.seed = Objects.requireNonNull(seed); + this.seedWeight = null; + } + + SeededKnnByteVectorQuery(String field, byte[] target, int k, Query filter, Weight seedWeight) { + super(field, target, k, filter); + this.seed = null; + this.seedWeight = Objects.requireNonNull(seedWeight); + } + + @Override + public Query rewrite(IndexSearcher indexSearcher) throws IOException { + if (seedWeight != null) { + return super.rewrite(indexSearcher); + } + BooleanQuery.Builder booleanSeedQueryBuilder = + new BooleanQuery.Builder() + .add(seed, BooleanClause.Occur.MUST) + .add(new FieldExistsQuery(field), BooleanClause.Occur.FILTER); + if (filter != null) { + booleanSeedQueryBuilder.add(filter, BooleanClause.Occur.FILTER); + } + Query seedRewritten = indexSearcher.rewrite(booleanSeedQueryBuilder.build()); + Weight seedWeight = indexSearcher.createWeight(seedRewritten, ScoreMode.TOP_SCORES, 1f); + SeededKnnByteVectorQuery rewritten = + new SeededKnnByteVectorQuery(field, target, k, filter, seedWeight); + return rewritten.rewrite(indexSearcher); + } + + @Override + protected KnnCollectorManager getKnnCollectorManager(int k, IndexSearcher searcher) { + if (seedWeight == null) { + throw new UnsupportedOperationException("must be rewritten before constructing manager"); + } + return new SeededKnnCollectorManager( + super.getKnnCollectorManager(k, searcher), + seedWeight, + k, + leaf -> { + ByteVectorValues vv = leaf.getByteVectorValues(field); + if (vv == null) { + ByteVectorValues.checkField(leaf.getContext().reader(), field); + } + return vv; + }); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/SeededKnnFloatVectorQuery.java b/lucene/core/src/java/org/apache/lucene/search/SeededKnnFloatVectorQuery.java new file mode 100644 index 000000000000..02a33bdcdef7 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/SeededKnnFloatVectorQuery.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.Objects; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.search.knn.KnnCollectorManager; +import org.apache.lucene.search.knn.SeededKnnCollectorManager; + +/** + * This is a version of knn float vector query that provides a query seed to initiate the vector + * search. NOTE: The underlying format is free to ignore the provided seed. + * + *
    See "Lexically-Accelerated Dense + * Retrieval" (Kulkarni, Hrishikesh and MacAvaney, Sean and Goharian, Nazli and Frieder, Ophir). + * In SIGIR '23: Proceedings of the 46th International ACM SIGIR Conference on Research and + * Development in Information Retrieval Pages 152 - 162 + * + * @lucene.experimental + */ +public class SeededKnnFloatVectorQuery extends KnnFloatVectorQuery { + final Query seed; + final Weight seedWeight; + + /** + * Construct a new SeededKnnFloatVectorQuery instance + * + * @param field knn float vector field to query + * @param target the query vector + * @param k number of neighbors to return + * @param filter a filter on the neighbors to return + * @param seed a query seed to initiate the vector format search + */ + public SeededKnnFloatVectorQuery(String field, float[] target, int k, Query filter, Query seed) { + super(field, target, k, filter); + this.seed = Objects.requireNonNull(seed); + this.seedWeight = null; + } + + SeededKnnFloatVectorQuery(String field, float[] target, int k, Query filter, Weight seedWeight) { + super(field, target, k, filter); + this.seed = null; + this.seedWeight = Objects.requireNonNull(seedWeight); + } + + @Override + public Query rewrite(IndexSearcher indexSearcher) throws IOException { + if (seedWeight != null) { + return super.rewrite(indexSearcher); + } + BooleanQuery.Builder booleanSeedQueryBuilder = + new BooleanQuery.Builder() + .add(seed, BooleanClause.Occur.MUST) + .add(new FieldExistsQuery(field), BooleanClause.Occur.FILTER); + if (filter != null) { + booleanSeedQueryBuilder.add(filter, BooleanClause.Occur.FILTER); + } + Query seedRewritten = indexSearcher.rewrite(booleanSeedQueryBuilder.build()); + Weight seedWeight = indexSearcher.createWeight(seedRewritten, ScoreMode.TOP_SCORES, 1f); + SeededKnnFloatVectorQuery rewritten = + new SeededKnnFloatVectorQuery(field, target, k, filter, seedWeight); + return rewritten.rewrite(indexSearcher); + } + + @Override + protected KnnCollectorManager getKnnCollectorManager(int k, IndexSearcher searcher) { + if (seedWeight == null) { + throw new UnsupportedOperationException("must be rewritten before constructing manager"); + } + return new SeededKnnCollectorManager( + super.getKnnCollectorManager(k, searcher), + seedWeight, + k, + leaf -> { + FloatVectorValues vv = leaf.getFloatVectorValues(field); + if (vv == null) { + FloatVectorValues.checkField(leaf.getContext().reader(), field); + } + return vv; + }); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/TimeLimitingKnnCollectorManager.java b/lucene/core/src/java/org/apache/lucene/search/TimeLimitingKnnCollectorManager.java index 2a1f312fbc58..2dc2f035b90f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TimeLimitingKnnCollectorManager.java +++ b/lucene/core/src/java/org/apache/lucene/search/TimeLimitingKnnCollectorManager.java @@ -45,51 +45,19 @@ public KnnCollector newCollector(int visitedLimit, LeafReaderContext context) th return new TimeLimitingKnnCollector(collector); } - class TimeLimitingKnnCollector implements KnnCollector { - private final KnnCollector collector; - - TimeLimitingKnnCollector(KnnCollector collector) { - this.collector = collector; + class TimeLimitingKnnCollector extends KnnCollector.Decorator { + public TimeLimitingKnnCollector(KnnCollector collector) { + super(collector); } @Override public boolean earlyTerminated() { - return queryTimeout.shouldExit() || collector.earlyTerminated(); - } - - @Override - public void incVisitedCount(int count) { - collector.incVisitedCount(count); 
- } - - @Override - public long visitedCount() { - return collector.visitedCount(); - } - - @Override - public long visitLimit() { - return collector.visitLimit(); - } - - @Override - public int k() { - return collector.k(); - } - - @Override - public boolean collect(int docId, float similarity) { - return collector.collect(docId, similarity); - } - - @Override - public float minCompetitiveSimilarity() { - return collector.minCompetitiveSimilarity(); + return queryTimeout.shouldExit() || super.earlyTerminated(); } @Override public TopDocs topDocs() { - TopDocs docs = collector.topDocs(); + TopDocs docs = super.topDocs(); // Mark results as partial if timeout is met TotalHits.Relation relation = diff --git a/lucene/core/src/java/org/apache/lucene/search/knn/EntryPointProvider.java b/lucene/core/src/java/org/apache/lucene/search/knn/EntryPointProvider.java new file mode 100644 index 000000000000..9e7b44b571df --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/knn/EntryPointProvider.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.knn; + +import org.apache.lucene.search.DocIdSetIterator; + +/** Provides entry points for the kNN search */ +public interface EntryPointProvider { + /** Iterator of valid entry points for the kNN search */ + DocIdSetIterator entryPoints(); + + /** Number of valid entry points for the kNN search */ + int numberOfEntryPoints(); +} diff --git a/lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollector.java b/lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollector.java new file mode 100644 index 000000000000..c3c4f62901ee --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollector.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.knn; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.KnnCollector; + +/** + * A {@link KnnCollector} that provides seeded knn collection. See usage in {@link + * SeededKnnCollectorManager}. + * + * @lucene.experimental + */ +class SeededKnnCollector extends KnnCollector.Decorator implements EntryPointProvider { + private final DocIdSetIterator entryPoints; + private final int numberOfEntryPoints; + + SeededKnnCollector( + KnnCollector collector, DocIdSetIterator entryPoints, int numberOfEntryPoints) { + super(collector); + this.entryPoints = entryPoints; + this.numberOfEntryPoints = numberOfEntryPoints; + } + + @Override + public DocIdSetIterator entryPoints() { + return entryPoints; + } + + @Override + public int numberOfEntryPoints() { + return numberOfEntryPoints; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollectorManager.java b/lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollectorManager.java new file mode 100644 index 000000000000..7631db6e3022 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/knn/SeededKnnCollectorManager.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.knn; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.index.KnnVectorValues; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.BulkScorer; +import org.apache.lucene.search.CollectionTerminatedException; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.LeafCollector; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.TopScoreDocCollectorManager; +import org.apache.lucene.search.Weight; +import org.apache.lucene.util.IOFunction; + +/** + * A {@link KnnCollectorManager} that provides seeded knn collection. See usage in {@link + * org.apache.lucene.search.SeededKnnFloatVectorQuery} and {@link + * org.apache.lucene.search.SeededKnnByteVectorQuery}. 
+ */ +public class SeededKnnCollectorManager implements KnnCollectorManager { + private final KnnCollectorManager delegate; + private final Weight seedWeight; + private final int k; + private final IOFunction vectorValuesSupplier; + + public SeededKnnCollectorManager( + KnnCollectorManager delegate, + Weight seedWeight, + int k, + IOFunction vectorValuesSupplier) { + this.delegate = delegate; + this.seedWeight = seedWeight; + this.k = k; + this.vectorValuesSupplier = vectorValuesSupplier; + } + + @Override + public KnnCollector newCollector(int visitedLimit, LeafReaderContext ctx) throws IOException { + // Execute the seed query + TopScoreDocCollector seedCollector = + new TopScoreDocCollectorManager(k, null, Integer.MAX_VALUE).newCollector(); + final LeafReader leafReader = ctx.reader(); + final LeafCollector leafCollector = seedCollector.getLeafCollector(ctx); + if (leafCollector != null) { + try { + BulkScorer scorer = seedWeight.bulkScorer(ctx); + if (scorer != null) { + scorer.score( + leafCollector, + leafReader.getLiveDocs(), + 0 /* min */, + DocIdSetIterator.NO_MORE_DOCS /* max */); + } + } catch ( + @SuppressWarnings("unused") + CollectionTerminatedException e) { + } + leafCollector.finish(); + } + + TopDocs seedTopDocs = seedCollector.topDocs(); + KnnVectorValues vectorValues = vectorValuesSupplier.apply(leafReader); + final KnnCollector delegateCollector = delegate.newCollector(visitedLimit, ctx); + if (seedTopDocs.totalHits.value() == 0 || vectorValues == null) { + return delegateCollector; + } + KnnVectorValues.DocIndexIterator indexIterator = vectorValues.iterator(); + DocIdSetIterator seedDocs = new MappedDISI(indexIterator, new TopDocsDISI(seedTopDocs)); + return new SeededKnnCollector(delegateCollector, seedDocs, seedTopDocs.scoreDocs.length); + } + + private static class MappedDISI extends DocIdSetIterator { + KnnVectorValues.DocIndexIterator indexedDISI; + DocIdSetIterator sourceDISI; + + private MappedDISI(KnnVectorValues.DocIndexIterator indexedDISI, DocIdSetIterator sourceDISI) { + this.indexedDISI = indexedDISI; + this.sourceDISI = sourceDISI; + } + + /** + * Advances the source iterator to the first document number that is greater than or equal to + * the provided target and returns the corresponding index. + */ + @Override + public int advance(int target) throws IOException { + int newTarget = sourceDISI.advance(target); + if (newTarget != NO_MORE_DOCS) { + indexedDISI.advance(newTarget); + } + return docID(); + } + + @Override + public long cost() { + return sourceDISI.cost(); + } + + @Override + public int docID() { + if (indexedDISI.docID() == NO_MORE_DOCS || sourceDISI.docID() == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } + return indexedDISI.index(); + } + + /** Advances to the next document in the source iterator and returns the corresponding index. 
*/ + @Override + public int nextDoc() throws IOException { + int newTarget = sourceDISI.nextDoc(); + if (newTarget != NO_MORE_DOCS) { + indexedDISI.advance(newTarget); + } + return docID(); + } + } + + private static class TopDocsDISI extends DocIdSetIterator { + private final int[] sortedDocIds; + private int idx = -1; + + private TopDocsDISI(TopDocs topDocs) { + sortedDocIds = new int[topDocs.scoreDocs.length]; + for (int i = 0; i < topDocs.scoreDocs.length; i++) { + sortedDocIds[i] = topDocs.scoreDocs[i].doc; + } + Arrays.sort(sortedDocIds); + } + + @Override + public int advance(int target) throws IOException { + return slowAdvance(target); + } + + @Override + public long cost() { + return sortedDocIds.length; + } + + @Override + public int docID() { + if (idx == -1) { + return -1; + } else if (idx >= sortedDocIds.length) { + return DocIdSetIterator.NO_MORE_DOCS; + } else { + return sortedDocIds[idx]; + } + } + + @Override + public int nextDoc() { + idx += 1; + return docID(); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java index 46d6c93d52c3..e8f0d316fd81 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java @@ -20,8 +20,10 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.IOException; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.TopKnnCollector; +import org.apache.lucene.search.knn.EntryPointProvider; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; @@ -52,7 +54,9 @@ public HnswGraphSearcher(NeighborQueue candidates, BitSet visited) { } /** - * Searches HNSW graph for the nearest neighbors of a query vector. + * Searches the HNSW graph for the nearest neighbors of a query vector. If entry points are + * directly provided via the knnCollector, then the search will be initialized at those points. + * Otherwise, the search will discover the best entry point per the normal HNSW search algorithm. 
* * @param scorer the scorer to compare the query with the nodes * @param knnCollector a collector of top knn results to be returned @@ -67,7 +71,30 @@ public static void search( HnswGraphSearcher graphSearcher = new HnswGraphSearcher( new NeighborQueue(knnCollector.k(), true), new SparseFixedBitSet(getGraphSize(graph))); - search(scorer, knnCollector, graph, graphSearcher, acceptOrds); + final int[] entryPoints; + if (knnCollector instanceof EntryPointProvider epp) { + if (epp.numberOfEntryPoints() <= 0) { + throw new IllegalArgumentException("The number of entry points must be > 0"); + } + DocIdSetIterator eps = epp.entryPoints(); + entryPoints = new int[epp.numberOfEntryPoints()]; + int idx = 0; + while (idx < entryPoints.length) { + int entryPointOrdInt = eps.nextDoc(); + if (entryPointOrdInt == NO_MORE_DOCS) { + throw new IllegalArgumentException( + "The number of entry points provided is less than the number of entry points requested"); + } + assert entryPointOrdInt < getGraphSize(graph); + entryPoints[idx++] = entryPointOrdInt; + } + // This is an invalid case, but we should check it + assert entryPoints.length > 0; + // We use provided entry point ordinals to search the complete graph (level 0) + graphSearcher.searchLevel(knnCollector, scorer, 0, entryPoints, graph, acceptOrds); + } else { + search(scorer, knnCollector, graph, graphSearcher, acceptOrds); + } } /** diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java index ed1a5ffb59fa..5225fe700ab9 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/OrdinalTranslatedKnnCollector.java @@ -24,54 +24,24 @@ /** * Wraps a provided KnnCollector object, translating the provided vectorId ordinal to a documentId */ -public final class OrdinalTranslatedKnnCollector implements KnnCollector { +public final class OrdinalTranslatedKnnCollector extends KnnCollector.Decorator { - private final KnnCollector in; private final IntToIntFunction vectorOrdinalToDocId; - public OrdinalTranslatedKnnCollector(KnnCollector in, IntToIntFunction vectorOrdinalToDocId) { - this.in = in; + public OrdinalTranslatedKnnCollector( + KnnCollector collector, IntToIntFunction vectorOrdinalToDocId) { + super(collector); this.vectorOrdinalToDocId = vectorOrdinalToDocId; } - @Override - public boolean earlyTerminated() { - return in.earlyTerminated(); - } - - @Override - public void incVisitedCount(int count) { - in.incVisitedCount(count); - } - - @Override - public long visitedCount() { - return in.visitedCount(); - } - - @Override - public long visitLimit() { - return in.visitLimit(); - } - - @Override - public int k() { - return in.k(); - } - @Override public boolean collect(int vectorId, float similarity) { - return in.collect(vectorOrdinalToDocId.apply(vectorId), similarity); - } - - @Override - public float minCompetitiveSimilarity() { - return in.minCompetitiveSimilarity(); + return super.collect(vectorOrdinalToDocId.apply(vectorId), similarity); } @Override public TopDocs topDocs() { - TopDocs td = in.topDocs(); + TopDocs td = super.topDocs(); return new TopDocs( new TotalHits( visitedCount(), diff --git a/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java index 2023ee73391d..1e485515a62b 100644 --- 
a/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java @@ -17,6 +17,7 @@ package org.apache.lucene.document; import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; +import java.nio.file.Path; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -24,19 +25,27 @@ import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MatchNoDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.SeededKnnFloatVectorQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.tests.codecs.vector.ConfigurableMCodec; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.LuceneTestCase.Monster; +import org.junit.BeforeClass; @TimeoutSuite(millis = 86_400_000) // 24 hour timeout @Monster("takes ~10 minutes and needs extra heap, disk space, file handles") public class TestManyKnnDocs extends LuceneTestCase { // gradlew -p lucene/core test --tests TestManyKnnDocs -Ptests.heapsize=16g -Dtests.monster=true - public void testLargeSegment() throws Exception { + private static Path testDir; + + @BeforeClass + public static void init_index() throws Exception { IndexWriterConfig iwc = new IndexWriterConfig(); iwc.setCodec( new ConfigurableMCodec( @@ -46,27 +55,138 @@ public void testLargeSegment() throws Exception { mp.setMaxMergeAtOnce(256); // avoid intermediate merges (waste of time with HNSW?) 
mp.setSegmentsPerTier(256); // only merge once at the end when we ask iwc.setMergePolicy(mp); - String fieldName = "field"; VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; - try (Directory dir = FSDirectory.open(createTempDir("ManyKnnVectorDocs")); + try (Directory dir = FSDirectory.open(testDir = createTempDir("ManyKnnVectorDocs")); IndexWriter iw = new IndexWriter(dir, iwc)) { int numVectors = 2088992; - float[] vector = new float[1]; - Document doc = new Document(); - doc.add(new KnnFloatVectorField(fieldName, vector, similarityFunction)); for (int i = 0; i < numVectors; i++) { + float[] vector = new float[1]; + Document doc = new Document(); vector[0] = (i % 256); + doc.add(new KnnFloatVectorField("field", vector, similarityFunction)); + doc.add(new KeywordField("int", "" + i, org.apache.lucene.document.Field.Store.YES)); + doc.add(new StoredField("intValue", i)); iw.addDocument(doc); } // merge to single segment and then verify iw.forceMerge(1); iw.commit(); + } + } + + public void testLargeSegmentKnn() throws Exception { + try (Directory dir = FSDirectory.open(testDir)) { IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir)); - TopDocs docs = searcher.search(new KnnFloatVectorQuery("field", new float[] {120}, 10), 5); - assertEquals(5, docs.scoreDocs.length); + for (int i = 0; i < 256; i++) { + Query filterQuery = new MatchAllDocsQuery(); + float[] vector = new float[128]; + vector[0] = i; + vector[1] = 1; + TopDocs docs = + searcher.search(new KnnFloatVectorQuery("field", vector, 10, filterQuery), 5); + assertEquals(5, docs.scoreDocs.length); + Document d = searcher.storedFields().document(docs.scoreDocs[0].doc); + String s = ""; + for (int j = 0; j < docs.scoreDocs.length - 1; j++) { + s += docs.scoreDocs[j].doc + " " + docs.scoreDocs[j].score + "\n"; + } + assertEquals(s, i + 256, d.getField("intValue").numericValue()); + } + } + } + + public void testLargeSegmentSeededExact() throws Exception { + try (Directory dir = FSDirectory.open(testDir)) { + IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir)); + for (int i = 0; i < 256; i++) { + Query seedQuery = KeywordField.newExactQuery("int", "" + (i + 256)); + Query filterQuery = new MatchAllDocsQuery(); + float[] vector = new float[128]; + vector[0] = i; + vector[1] = 1; + TopDocs docs = + searcher.search( + new SeededKnnFloatVectorQuery("field", vector, 10, filterQuery, seedQuery), 5); + assertEquals(5, docs.scoreDocs.length); + String s = ""; + for (int j = 0; j < docs.scoreDocs.length - 1; j++) { + s += docs.scoreDocs[j].doc + " " + docs.scoreDocs[j].score + "\n"; + } + Document d = searcher.storedFields().document(docs.scoreDocs[0].doc); + assertEquals(s, i + 256, d.getField("intValue").numericValue()); + } + } + } + + public void testLargeSegmentSeededNearby() throws Exception { + try (Directory dir = FSDirectory.open(testDir)) { + IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir)); + for (int i = 0; i < 256; i++) { + Query seedQuery = KeywordField.newExactQuery("int", "" + i); + Query filterQuery = new MatchAllDocsQuery(); + float[] vector = new float[128]; + vector[0] = i; + vector[1] = 1; + TopDocs docs = + searcher.search( + new SeededKnnFloatVectorQuery("field", vector, 10, filterQuery, seedQuery), 5); + assertEquals(5, docs.scoreDocs.length); + String s = ""; + for (int j = 0; j < docs.scoreDocs.length - 1; j++) { + s += docs.scoreDocs[j].doc + " " + docs.scoreDocs[j].score + "\n"; + } + Document d = 
searcher.storedFields().document(docs.scoreDocs[0].doc); + assertEquals(s, i + 256, d.getField("intValue").numericValue()); + } + } + } + + public void testLargeSegmentSeededDistant() throws Exception { + try (Directory dir = FSDirectory.open(testDir)) { + IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir)); + for (int i = 0; i < 256; i++) { + Query seedQuery = KeywordField.newExactQuery("int", "" + (i + 128)); + Query filterQuery = new MatchAllDocsQuery(); + float[] vector = new float[128]; + vector[0] = i; + vector[1] = 1; + TopDocs docs = + searcher.search( + new SeededKnnFloatVectorQuery("field", vector, 10, filterQuery, seedQuery), 5); + assertEquals(5, docs.scoreDocs.length); + Document d = searcher.storedFields().document(docs.scoreDocs[0].doc); + String s = ""; + for (int j = 0; j < docs.scoreDocs.length - 1; j++) { + s += docs.scoreDocs[j].doc + " " + docs.scoreDocs[j].score + "\n"; + } + assertEquals(s, i + 256, d.getField("intValue").numericValue()); + } + } + } + + public void testLargeSegmentSeededNone() throws Exception { + try (Directory dir = FSDirectory.open(testDir)) { + IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir)); + for (int i = 0; i < 256; i++) { + Query seedQuery = new MatchNoDocsQuery(); + Query filterQuery = new MatchAllDocsQuery(); + float[] vector = new float[128]; + vector[0] = i; + vector[1] = 1; + TopDocs docs = + searcher.search( + new SeededKnnFloatVectorQuery("field", vector, 10, filterQuery, seedQuery), 5); + assertEquals(5, docs.scoreDocs.length); + Document d = searcher.storedFields().document(docs.scoreDocs[0].doc); + String s = ""; + for (int j = 0; j < docs.scoreDocs.length - 1; j++) { + s += docs.scoreDocs[j].doc + " " + docs.scoreDocs[j].score + "\n"; + } + assertEquals(s, i + 256, d.getField("intValue").numericValue()); + } } } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQuery.java index b45d6e8fb641..21219e0e1d99 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQuery.java @@ -61,7 +61,7 @@ Field getKnnVectorField(String name, float[] vector) { return new KnnByteVectorField(name, floatToBytes(vector), VectorSimilarityFunction.EUCLIDEAN); } - private static byte[] floatToBytes(float[] query) { + static byte[] floatToBytes(float[] query) { byte[] bytes = new byte[query.length]; for (int i = 0; i < query.length; i++) { assert query[i] <= Byte.MAX_VALUE && query[i] >= Byte.MIN_VALUE && (query[i] % 1) == 0 @@ -109,7 +109,7 @@ public void testVectorEncodingMismatch() throws IOException { } } - private static class ThrowingKnnVectorQuery extends KnnByteVectorQuery { + static class ThrowingKnnVectorQuery extends KnnByteVectorQuery { public ThrowingKnnVectorQuery(String field, byte[] target, int k, Query filter) { super(field, target, k, filter); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestKnnFloatVectorQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestKnnFloatVectorQuery.java index 5dcb6f97df93..ece2b385654e 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestKnnFloatVectorQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestKnnFloatVectorQuery.java @@ -259,7 +259,7 @@ public void testDocAndScoreQueryBasics() throws IOException { } } - private static class ThrowingKnnVectorQuery extends KnnFloatVectorQuery { + static class ThrowingKnnVectorQuery 
extends KnnFloatVectorQuery { public ThrowingKnnVectorQuery(String field, float[] target, int k, Query filter) { super(field, target, k, filter); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSeededKnnByteVectorQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestSeededKnnByteVectorQuery.java new file mode 100644 index 000000000000..d0fb8c95e035 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestSeededKnnByteVectorQuery.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import static org.apache.lucene.search.TestKnnByteVectorQuery.floatToBytes; + +import java.io.IOException; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.KnnByteVectorField; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.QueryTimeout; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.TestVectorUtil; + +public class TestSeededKnnByteVectorQuery extends BaseKnnVectorQueryTestCase { + + private static final Query MATCH_NONE = new MatchNoDocsQuery(); + + @Override + AbstractKnnVectorQuery getKnnVectorQuery(String field, float[] query, int k, Query queryFilter) { + return new SeededKnnByteVectorQuery(field, floatToBytes(query), k, queryFilter, MATCH_NONE); + } + + @Override + AbstractKnnVectorQuery getThrowingKnnVectorQuery(String field, float[] vec, int k, Query query) { + return new ThrowingKnnVectorQuery(field, floatToBytes(vec), k, query, MATCH_NONE); + } + + @Override + float[] randomVector(int dim) { + byte[] b = TestVectorUtil.randomVectorBytes(dim); + float[] v = new float[b.length]; + int vi = 0; + for (int i = 0; i < v.length; i++) { + v[vi++] = b[i]; + } + return v; + } + + @Override + Field getKnnVectorField( + String name, float[] vector, VectorSimilarityFunction similarityFunction) { + return new KnnByteVectorField(name, floatToBytes(vector), similarityFunction); + } + + @Override + Field getKnnVectorField(String name, float[] vector) { + return new KnnByteVectorField(name, floatToBytes(vector), VectorSimilarityFunction.EUCLIDEAN); + } + + /** Tests with random vectors and a random seed. Uses RandomIndexWriter. 
*/ + public void testRandomWithSeed() throws IOException { + int numDocs = 1000; + int dimension = atLeast(5); + int numIters = atLeast(10); + int numDocsWithVector = 0; + try (Directory d = newDirectoryForTest()) { + // Always use the default kNN format to have predictable behavior around when it hits + // visitedLimit. This is fine since the test targets AbstractKnnVectorQuery logic, not the kNN + // format + // implementation. + IndexWriterConfig iwc = new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec()); + RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (random().nextBoolean()) { + // Randomly skip some vectors to test the mapping from docid to ordinals + doc.add(getKnnVectorField("field", randomVector(dimension))); + numDocsWithVector += 1; + } + doc.add(new NumericDocValuesField("tag", i)); + doc.add(new IntPoint("tag", i)); + w.addDocument(doc); + } + w.forceMerge(1); + w.close(); + + try (IndexReader reader = DirectoryReader.open(d)) { + IndexSearcher searcher = newSearcher(reader); + for (int i = 0; i < numIters; i++) { + int k = random().nextInt(80) + 1; + int n = random().nextInt(100) + 1; + // we may get fewer results than requested if there are deletions, but this test doesn't + // check that + assert reader.hasDeletions() == false; + + // All documents as seeds + Query seed1 = new MatchAllDocsQuery(); + Query filter = random().nextBoolean() ? null : new MatchAllDocsQuery(); + SeededKnnByteVectorQuery query = + new SeededKnnByteVectorQuery( + "field", floatToBytes(randomVector(dimension)), k, filter, seed1); + TopDocs results = searcher.search(query, n); + int expected = Math.min(Math.min(n, k), numDocsWithVector); + + assertEquals(expected, results.scoreDocs.length); + assertTrue(results.totalHits.value() >= results.scoreDocs.length); + // verify the results are in descending score order + float last = Float.MAX_VALUE; + for (ScoreDoc scoreDoc : results.scoreDocs) { + assertTrue(scoreDoc.score <= last); + last = scoreDoc.score; + } + + // Restrictive seed query -- 6 documents + Query seed2 = IntPoint.newRangeQuery("tag", 1, 6); + query = + new SeededKnnByteVectorQuery( + "field", floatToBytes(randomVector(dimension)), k, null, seed2); + results = searcher.search(query, n); + expected = Math.min(Math.min(n, k), reader.numDocs()); + assertEquals(expected, results.scoreDocs.length); + assertTrue(results.totalHits.value() >= results.scoreDocs.length); + // verify the results are in descending score order + last = Float.MAX_VALUE; + for (ScoreDoc scoreDoc : results.scoreDocs) { + assertTrue(scoreDoc.score <= last); + last = scoreDoc.score; + } + + // No seed documents -- falls back on full approx search + Query seed3 = new MatchNoDocsQuery(); + query = + new SeededKnnByteVectorQuery( + "field", floatToBytes(randomVector(dimension)), k, null, seed3); + results = searcher.search(query, n); + expected = Math.min(Math.min(n, k), reader.numDocs()); + assertEquals(expected, results.scoreDocs.length); + assertTrue(results.totalHits.value() >= results.scoreDocs.length); + // verify the results are in descending score order + last = Float.MAX_VALUE; + for (ScoreDoc scoreDoc : results.scoreDocs) { + assertTrue(scoreDoc.score <= last); + last = scoreDoc.score; + } + } + } + } + } + + private static class ThrowingKnnVectorQuery extends SeededKnnByteVectorQuery { + + public ThrowingKnnVectorQuery(String field, byte[] target, int k, Query filter, Query seed) { + super(field, target, k, filter, 
seed); + } + + private ThrowingKnnVectorQuery( + String field, byte[] target, int k, Query filter, Weight seedWeight) { + super(field, target, k, filter, seedWeight); + } + + @Override + // This is test only and we need to overwrite the inner rewrite to throw + public Query rewrite(IndexSearcher indexSearcher) throws IOException { + if (seedWeight != null) { + return super.rewrite(indexSearcher); + } + BooleanQuery.Builder booleanSeedQueryBuilder = + new BooleanQuery.Builder() + .add(seed, BooleanClause.Occur.MUST) + .add(new FieldExistsQuery(field), BooleanClause.Occur.FILTER); + if (filter != null) { + booleanSeedQueryBuilder.add(filter, BooleanClause.Occur.FILTER); + } + Query seedRewritten = indexSearcher.rewrite(booleanSeedQueryBuilder.build()); + Weight seedWeight = indexSearcher.createWeight(seedRewritten, ScoreMode.TOP_SCORES, 1f); + return new ThrowingKnnVectorQuery(field, target, k, filter, seedWeight) + .rewrite(indexSearcher); + } + + @Override + protected TopDocs exactSearch( + LeafReaderContext context, DocIdSetIterator acceptIterator, QueryTimeout queryTimeout) { + throw new UnsupportedOperationException("exact search is not supported"); + } + + @Override + public String toString(String field) { + return null; + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSeededKnnFloatVectorQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestSeededKnnFloatVectorQuery.java new file mode 100644 index 000000000000..d5630037ef74 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestSeededKnnFloatVectorQuery.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search; + +import java.io.IOException; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.QueryTimeout; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.TestVectorUtil; + +public class TestSeededKnnFloatVectorQuery extends BaseKnnVectorQueryTestCase { + private static final Query MATCH_NONE = new MatchNoDocsQuery(); + + @Override + KnnFloatVectorQuery getKnnVectorQuery(String field, float[] query, int k, Query queryFilter) { + return new SeededKnnFloatVectorQuery(field, query, k, queryFilter, MATCH_NONE); + } + + @Override + AbstractKnnVectorQuery getThrowingKnnVectorQuery(String field, float[] vec, int k, Query query) { + return new ThrowingKnnVectorQuery(field, vec, k, query, MATCH_NONE); + } + + @Override + float[] randomVector(int dim) { + return TestVectorUtil.randomVector(dim); + } + + @Override + Field getKnnVectorField( + String name, float[] vector, VectorSimilarityFunction similarityFunction) { + return new KnnFloatVectorField(name, vector, similarityFunction); + } + + @Override + Field getKnnVectorField(String name, float[] vector) { + return new KnnFloatVectorField(name, vector); + } + + /** Tests with random vectors and a random seed. Uses RandomIndexWriter. */ + public void testRandomWithSeed() throws IOException { + int numDocs = 1000; + int dimension = atLeast(5); + int numIters = atLeast(10); + int numDocsWithVector = 0; + try (Directory d = newDirectoryForTest()) { + // Always use the default kNN format to have predictable behavior around when it hits + // visitedLimit. This is fine since the test targets AbstractKnnVectorQuery logic, not the kNN + // format + // implementation. + IndexWriterConfig iwc = new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec()); + RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (random().nextBoolean()) { + // Randomly skip some vectors to test the mapping from docid to ordinals + doc.add(getKnnVectorField("field", randomVector(dimension))); + numDocsWithVector += 1; + } + doc.add(new NumericDocValuesField("tag", i)); + doc.add(new IntPoint("tag", i)); + w.addDocument(doc); + } + w.forceMerge(1); + w.close(); + + try (IndexReader reader = DirectoryReader.open(d)) { + IndexSearcher searcher = newSearcher(reader); + for (int i = 0; i < numIters; i++) { + int k = random().nextInt(80) + 1; + int n = random().nextInt(100) + 1; + // we may get fewer results than requested if there are deletions, but this test doesn't + // check that + assert reader.hasDeletions() == false; + + // All documents as seeds + Query seed1 = new MatchAllDocsQuery(); + Query filter = random().nextBoolean() ? 
null : new MatchAllDocsQuery(); + AbstractKnnVectorQuery query = + new SeededKnnFloatVectorQuery("field", randomVector(dimension), k, filter, seed1); + TopDocs results = searcher.search(query, n); + int expected = Math.min(Math.min(n, k), numDocsWithVector); + + assertEquals(expected, results.scoreDocs.length); + assertTrue(results.totalHits.value() >= results.scoreDocs.length); + // verify the results are in descending score order + float last = Float.MAX_VALUE; + for (ScoreDoc scoreDoc : results.scoreDocs) { + assertTrue(scoreDoc.score <= last); + last = scoreDoc.score; + } + + // Restrictive seed query -- 6 documents + Query seed2 = IntPoint.newRangeQuery("tag", 1, 6); + query = new SeededKnnFloatVectorQuery("field", randomVector(dimension), k, null, seed2); + results = searcher.search(query, n); + expected = Math.min(Math.min(n, k), reader.numDocs()); + assertEquals(expected, results.scoreDocs.length); + assertTrue(results.totalHits.value() >= results.scoreDocs.length); + // verify the results are in descending score order + last = Float.MAX_VALUE; + for (ScoreDoc scoreDoc : results.scoreDocs) { + assertTrue(scoreDoc.score <= last); + last = scoreDoc.score; + } + + // No seed documents -- falls back on full approx search + Query seed3 = new MatchNoDocsQuery(); + query = new SeededKnnFloatVectorQuery("field", randomVector(dimension), k, null, seed3); + results = searcher.search(query, n); + expected = Math.min(Math.min(n, k), reader.numDocs()); + assertEquals(expected, results.scoreDocs.length); + assertTrue(results.totalHits.value() >= results.scoreDocs.length); + // verify the results are in descending score order + last = Float.MAX_VALUE; + for (ScoreDoc scoreDoc : results.scoreDocs) { + assertTrue(scoreDoc.score <= last); + last = scoreDoc.score; + } + } + } + } + } + + private static class ThrowingKnnVectorQuery extends SeededKnnFloatVectorQuery { + + private ThrowingKnnVectorQuery(String field, float[] target, int k, Query filter, Query seed) { + super(field, target, k, filter, seed); + } + + private ThrowingKnnVectorQuery( + String field, float[] target, int k, Query filter, Weight seedWeight) { + super(field, target, k, filter, seedWeight); + } + + @Override + // This is test only and we need to overwrite the inner rewrite to throw + public Query rewrite(IndexSearcher indexSearcher) throws IOException { + if (seedWeight != null) { + return super.rewrite(indexSearcher); + } + BooleanQuery.Builder booleanSeedQueryBuilder = + new BooleanQuery.Builder() + .add(seed, BooleanClause.Occur.MUST) + .add(new FieldExistsQuery(field), BooleanClause.Occur.FILTER); + if (filter != null) { + booleanSeedQueryBuilder.add(filter, BooleanClause.Occur.FILTER); + } + Query seedRewritten = indexSearcher.rewrite(booleanSeedQueryBuilder.build()); + Weight seedWeight = indexSearcher.createWeight(seedRewritten, ScoreMode.TOP_SCORES, 1f); + return new ThrowingKnnVectorQuery(field, target, k, filter, seedWeight) + .rewrite(indexSearcher); + } + + @Override + protected TopDocs exactSearch( + LeafReaderContext context, DocIdSetIterator acceptIterator, QueryTimeout queryTimeout) { + throw new UnsupportedOperationException("exact search is not supported"); + } + + @Override + public String toString(String field) { + return null; + } + } +} From df7170efd17143a3e27f1c7ae87eafb85784c735 Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Wed, 15 Jan 2025 17:58:49 +0100 Subject: [PATCH 27/88] Temporarily skip tasks that point at datasets previously hosted at home.apache.org #13647 #14144 --- 
gradle/datasets/external-datasets.gradle | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/gradle/datasets/external-datasets.gradle b/gradle/datasets/external-datasets.gradle index 44fd38117bb3..155382e4f463 100644 --- a/gradle/datasets/external-datasets.gradle +++ b/gradle/datasets/external-datasets.gradle @@ -52,6 +52,9 @@ configure(project(":lucene:benchmark")) { dst = file("${dataDir}/${name}") } + // TODO: dataset gone. https://github.com/apache/lucene/issues/13647 + onlyIf { false } + outputs.file ext.dst src ext.src @@ -73,6 +76,9 @@ configure(project(":lucene:benchmark")) { dst = file("${dataDir}/${name}") } + // TODO: dataset gone. https://github.com/apache/lucene/issues/13647 + onlyIf { false } + outputs.file ext.dst src ext.src @@ -99,6 +105,9 @@ configure(project(":lucene:benchmark")) { outputs.file ext.dst + // TODO: dataset gone. https://github.com/apache/lucene/issues/13647 + onlyIf { false } + src ext.src dest ext.intermediate overwrite false @@ -118,6 +127,9 @@ configure(project(":lucene:benchmark")) { dst = file("${dataDir}/${name}") } + // TODO: dataset gone. https://github.com/apache/lucene/issues/13647 + onlyIf { false } + outputs.dir ext.dst src ext.src From 16cd779e9dc982ddea19f787bc294a49fb5b2927 Mon Sep 17 00:00:00 2001 From: Clay Johnson Date: Wed, 15 Jan 2025 11:55:43 -0600 Subject: [PATCH 28/88] Publish build scans to develocity.apache.org (#14141) --- .github/workflows/run-checks-all.yml | 2 +- .github/workflows/run-checks-gradle-upgrade.yml | 2 +- .github/workflows/run-checks-mod-analysis-common.yml | 2 +- .../workflows/run-checks-mod-distribution.tests.yml | 2 +- README.md | 2 +- gradle/ge.gradle | 12 ++++++------ settings.gradle | 4 ++-- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/run-checks-all.yml b/.github/workflows/run-checks-all.yml index 18dd308e9a77..fdf23e4c3460 100644 --- a/.github/workflows/run-checks-all.yml +++ b/.github/workflows/run-checks-all.yml @@ -13,7 +13,7 @@ on: - 'branch_10x' env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }} # We split the workflow into two parallel jobs for efficiency: # one is running all validation checks without tests, diff --git a/.github/workflows/run-checks-gradle-upgrade.yml b/.github/workflows/run-checks-gradle-upgrade.yml index 07b7210cf4e2..b026ce96bba2 100644 --- a/.github/workflows/run-checks-gradle-upgrade.yml +++ b/.github/workflows/run-checks-gradle-upgrade.yml @@ -20,7 +20,7 @@ on: - 'gradle/wrapper/**' env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }} jobs: gradleSanityCheck: diff --git a/.github/workflows/run-checks-mod-analysis-common.yml b/.github/workflows/run-checks-mod-analysis-common.yml index a208039a99fa..5f53263b3229 100644 --- a/.github/workflows/run-checks-mod-analysis-common.yml +++ b/.github/workflows/run-checks-mod-analysis-common.yml @@ -20,7 +20,7 @@ on: - 'lucene/analysis/common/**' env: - GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }} jobs: test: diff --git a/.github/workflows/run-checks-mod-distribution.tests.yml b/.github/workflows/run-checks-mod-distribution.tests.yml index e3af5812c80c..b78db6dd9463 100644 --- a/.github/workflows/run-checks-mod-distribution.tests.yml +++ b/.github/workflows/run-checks-mod-distribution.tests.yml @@ -14,7 +14,7 @@ on: - 'branch_10x' env: - 
GRADLE_ENTERPRISE_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }} + DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }} jobs: test: diff --git a/README.md b/README.md index c613a16986ea..c2c963ef50e3 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Apache Lucene is a high-performance, full-featured text search engine library written in Java. [![Build Status](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/badge/icon?subject=Lucene)](https://ci-builds.apache.org/job/Lucene/job/Lucene-Artifacts-main/) -[![Revved up by Develocity](https://img.shields.io/badge/Revved%20up%20by-Develocity-06A0CE?logo=Gradle&labelColor=02303A)](https://ge.apache.org/scans?search.buildToolType=gradle&search.rootProjectNames=lucene-root) +[![Revved up by Develocity](https://img.shields.io/badge/Revved%20up%20by-Develocity-06A0CE?logo=Gradle&labelColor=02303A)](https://develocity.apache.org/scans?search.buildToolType=gradle&search.rootProjectNames=lucene-root) ## Online Documentation diff --git a/gradle/ge.gradle b/gradle/ge.gradle index f6bba24f23f5..c4677859e33a 100644 --- a/gradle/ge.gradle +++ b/gradle/ge.gradle @@ -17,13 +17,13 @@ def isCIBuild = System.getenv().keySet().find { it ==~ /(?i)((JENKINS|HUDSON)(_\w+)?|CI)/ } != null -gradleEnterprise { - server = "https://ge.apache.org" +develocity { + server = "https://develocity.apache.org" + projectId = "lucene" + buildScan { - capture { taskInputFiles = true } uploadInBackground = !isCIBuild - publishAlways() - publishIfAuthenticated() + publishing.onlyIf { it.isAuthenticated() } obfuscation { ipAddresses { addresses -> addresses.collect { address -> "0.0.0.0"} } } @@ -35,7 +35,7 @@ buildCache { enabled = !isCIBuild } - remote(gradleEnterprise.buildCache) { + remote(develocity.buildCache) { enabled = false } } diff --git a/settings.gradle b/settings.gradle index f4ee13243ca6..8543bab1619f 100644 --- a/settings.gradle +++ b/settings.gradle @@ -26,8 +26,8 @@ pluginManagement { plugins { id "org.gradle.toolchains.foojay-resolver-convention" version "0.8.0" - id 'com.gradle.enterprise' version '3.15.1' - id 'com.gradle.common-custom-user-data-gradle-plugin' version '1.11.3' + id 'com.gradle.develocity' version '3.18.2' + id 'com.gradle.common-custom-user-data-gradle-plugin' version '2.0.2' } dependencyResolutionManagement { From e4b85cab57602ddcd9c7e2e6647be9988621ebbe Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Thu, 16 Jan 2025 07:52:43 +0100 Subject: [PATCH 29/88] Implement IntersectVisitor#visit(IntsRef) whenever it makes sense (#14138) Implement IntersectVisitor#visit(IntsRef) in many of the current implementations and add BulkAdder#add(IntsRef) method. They should provide better performance due to less virtual method calls and more efficient bulk processing. 
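For readers who want to see the bulk-visit pattern outside of the diff context, here is a minimal, illustrative sketch (not taken from the patch) of a collecting PointValues.IntersectVisitor that forwards whole blocks of doc IDs to the new DocIdSetBuilder.BulkAdder#add(IntsRef) overload instead of issuing one virtual visit(int) call per document. The class name CollectAllVisitor and the always-matching compare() are assumptions made for this example only.

    import java.io.IOException;
    import org.apache.lucene.index.PointValues.IntersectVisitor;
    import org.apache.lucene.index.PointValues.Relation;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.util.DocIdSetBuilder;
    import org.apache.lucene.util.IntsRef;

    /** Illustrative only: collects every visited doc ID into a DocIdSetBuilder. */
    final class CollectAllVisitor implements IntersectVisitor {
      private final DocIdSetBuilder.BulkAdder adder;

      CollectAllVisitor(DocIdSetBuilder.BulkAdder adder) {
        this.adder = adder;
      }

      @Override
      public void visit(int docID) {
        adder.add(docID); // per-document fallback path
      }

      @Override
      public void visit(IntsRef ref) {
        adder.add(ref); // bulk path introduced by this change: one call per block of doc IDs
      }

      @Override
      public void visit(DocIdSetIterator iterator) throws IOException {
        adder.add(iterator); // pre-existing iterator-based bulk path
      }

      @Override
      public void visit(int docID, byte[] packedValue) {
        adder.add(docID); // per-value callback; a real query would test packedValue here
      }

      @Override
      public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
        // Report that every cell matches so leaf blocks can take the doc-ID-only bulk paths.
        return Relation.CELL_INSIDE_QUERY;
      }
    }

A query would typically obtain the adder from DocIdSetBuilder#grow before calling PointValues#intersect; the diff below wires the same IntsRef overload into the existing geo, range and shape queries in essentially this way.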
--- lucene/CHANGES.txt | 4 ++ .../document/LatLonPointDistanceQuery.java | 14 ++++++ .../document/LongDistanceFeatureQuery.java | 16 +++++++ .../lucene/document/RangeFieldQuery.java | 10 +++- .../apache/lucene/document/SpatialQuery.java | 44 +++++++++++++++++ .../document/XYPointInGeometryQuery.java | 6 +++ .../apache/lucene/search/PointRangeQuery.java | 4 +- .../apache/lucene/util/DocIdSetBuilder.java | 48 +++++++++++-------- .../org/apache/lucene/util/bkd/BKDReader.java | 14 ++++++ .../lucene/util/TestDocIdSetBuilder.java | 19 +++++--- .../PointInShapeIntersectVisitor.java | 6 +++ 11 files changed, 154 insertions(+), 31 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 5084c25f3560..c2b37a94f895 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -61,6 +61,10 @@ Improvements * GITHUB#14113: Remove unnecessary ByteArrayDataInput allocations from `Lucene90DocValuesProducer$TermsDict.decompressBlock`. (Ankit Jain) +* GITHUB#14138: Implement IntersectVisitor#visit(IntsRef) in many of the current implementations and add + BulkAdder#add(IntsRef) method. They should provide better performance due to less virtual method calls and + more efficient bulk processing. (Ignacio Vera) + Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java b/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java index 7f5f8cf6290c..9cebb8e73014 100644 --- a/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java @@ -44,6 +44,7 @@ import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.NumericUtils; /** Distance query for {@link LatLonPoint}. 
*/ @@ -233,6 +234,11 @@ public void visit(int docID) { adder.add(docID); } + @Override + public void visit(IntsRef ref) { + adder.add(ref); + } + @Override public void visit(DocIdSetIterator iterator) throws IOException { adder.add(iterator); @@ -269,6 +275,14 @@ public void visit(int docID) { cost[0]--; } + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; i++) { + result.clear(ref.ints[ref.offset + i]); + } + cost[0] = -ref.length; + } + @Override public void visit(DocIdSetIterator iterator) throws IOException { result.andNot(iterator); diff --git a/lucene/core/src/java/org/apache/lucene/document/LongDistanceFeatureQuery.java b/lucene/core/src/java/org/apache/lucene/document/LongDistanceFeatureQuery.java index c675136ca80d..788ded4909ba 100644 --- a/lucene/core/src/java/org/apache/lucene/document/LongDistanceFeatureQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/LongDistanceFeatureQuery.java @@ -35,6 +35,7 @@ import org.apache.lucene.search.ScorerSupplier; import org.apache.lucene.search.Weight; import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.NumericUtils; final class LongDistanceFeatureQuery extends Query { @@ -405,6 +406,21 @@ public void visit(int docID, byte[] packedValue) { adder.add(docID); } + @Override + public void visit(DocIdSetIterator iterator) throws IOException { + int docID; + while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + visit(docID); + } + } + + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; ++i) { + visit(ref.ints[ref.offset + i]); + } + } + @Override public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { long minDocValue = NumericUtils.sortableBytesToLong(minPackedValue, 0); diff --git a/lucene/core/src/java/org/apache/lucene/document/RangeFieldQuery.java b/lucene/core/src/java/org/apache/lucene/document/RangeFieldQuery.java index f5747c0f8bde..8248441f3cda 100644 --- a/lucene/core/src/java/org/apache/lucene/document/RangeFieldQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/RangeFieldQuery.java @@ -38,6 +38,7 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil.ByteArrayComparator; import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.IntsRef; /** * Query class for searching {@code RangeField} types by a defined {@link Relation}. 
@@ -401,7 +402,12 @@ public void grow(int count) { } @Override - public void visit(int docID) throws IOException { + public void visit(IntsRef ref) { + adder.add(ref); + } + + @Override + public void visit(int docID) { adder.add(docID); } @@ -411,7 +417,7 @@ public void visit(DocIdSetIterator iterator) throws IOException { } @Override - public void visit(int docID, byte[] leaf) throws IOException { + public void visit(int docID, byte[] leaf) { if (queryType.matches(ranges, leaf, numDims, bytesPerDim, comparator)) { visit(docID); } diff --git a/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java b/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java index 811591d9a1cd..cc233f89948d 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java @@ -49,6 +49,7 @@ import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IntsRef; /** * Base query class for all spatial geometries: {@link LatLonShape}, {@link LatLonPoint} and {@link @@ -445,6 +446,11 @@ public void visit(DocIdSetIterator iterator) throws IOException { adder.add(iterator); } + @Override + public void visit(IntsRef ref) { + adder.add(ref); + } + @Override public void visit(int docID, byte[] t) { if (leafPredicate.test(t)) { @@ -489,6 +495,14 @@ public void visit(DocIdSetIterator iterator) throws IOException { cost[0] += iterator.cost(); } + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; i++) { + result.set(ref.ints[ref.offset + i]); + } + cost[0] += ref.length; + } + @Override public void visit(int docID, byte[] t) { if (result.get(docID) == false) { @@ -532,6 +546,14 @@ public void visit(int docID) { cost[0]++; } + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; i++) { + result.set(ref.ints[ref.offset + i]); + } + cost[0] += ref.length; + } + @Override public void visit(DocIdSetIterator iterator) throws IOException { result.or(iterator); @@ -589,6 +611,13 @@ public void visit(DocIdSetIterator iterator) throws IOException { excluded.or(iterator); } + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; i++) { + visit(ref.ints[ref.offset + i]); + } + } + @Override public void visit(int docID, byte[] t) { if (excluded.get(docID) == false) { @@ -643,6 +672,14 @@ public void visit(int docID) { cost[0]--; } + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; i++) { + result.clear(ref.ints[ref.offset + i]); + } + cost[0] -= ref.length; + } + @Override public void visit(DocIdSetIterator iterator) throws IOException { result.andNot(iterator); @@ -693,6 +730,13 @@ public void visit(DocIdSetIterator iterator) throws IOException { result.andNot(iterator); } + @Override + public void visit(IntsRef ref) { + for (int i = 0; i < ref.length; i++) { + visit(ref.ints[ref.offset + i]); + } + } + @Override public void visit(int docID, byte[] packedTriangle) { // NO-OP diff --git a/lucene/core/src/java/org/apache/lucene/document/XYPointInGeometryQuery.java b/lucene/core/src/java/org/apache/lucene/document/XYPointInGeometryQuery.java index 47b6abb46c22..833d9c9209c6 100644 --- a/lucene/core/src/java/org/apache/lucene/document/XYPointInGeometryQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/XYPointInGeometryQuery.java @@ -38,6 +38,7 @@ import org.apache.lucene.search.ScorerSupplier; import 
org.apache.lucene.search.Weight; import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.IntsRef; /** * Finds all previously indexed points that fall within the specified XY geometries. @@ -90,6 +91,11 @@ public void visit(DocIdSetIterator iterator) throws IOException { adder.add(iterator); } + @Override + public void visit(IntsRef ref) { + adder.add(ref); + } + @Override public void visit(int docID, byte[] packedValue) { double x = XYEncodingUtils.decode(packedValue, 0); diff --git a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java index 1b6d6869c19e..e5d956e8d1ee 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java @@ -188,9 +188,7 @@ public void visit(DocIdSetIterator iterator) throws IOException { @Override public void visit(IntsRef ref) { - for (int i = ref.offset; i < ref.offset + ref.length; i++) { - adder.add(ref.ints[i]); - } + adder.add(ref); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java index 28128af05f67..159cef025678 100644 --- a/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java @@ -41,29 +41,28 @@ public final class DocIdSetBuilder { * * @see DocIdSetBuilder#grow */ - public abstract static class BulkAdder { - public abstract void add(int doc); + public sealed interface BulkAdder permits FixedBitSetAdder, BufferAdder { + void add(int doc); - public void add(DocIdSetIterator iterator) throws IOException { - int docID; - while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - add(docID); - } - } - } + void add(IntsRef docs); - private static class FixedBitSetAdder extends BulkAdder { - final FixedBitSet bitSet; + void add(DocIdSetIterator iterator) throws IOException; + } - FixedBitSetAdder(FixedBitSet bitSet) { - this.bitSet = bitSet; - } + private record FixedBitSetAdder(FixedBitSet bitSet) implements BulkAdder { @Override public void add(int doc) { bitSet.set(doc); } + @Override + public void add(IntsRef docs) { + for (int i = 0; i < docs.length; i++) { + bitSet.set(docs.ints[docs.offset + i]); + } + } + @Override public void add(DocIdSetIterator iterator) throws IOException { bitSet.or(iterator); @@ -85,17 +84,26 @@ private static class Buffer { } } - private static class BufferAdder extends BulkAdder { - final Buffer buffer; - - BufferAdder(Buffer buffer) { - this.buffer = buffer; - } + private record BufferAdder(Buffer buffer) implements BulkAdder { @Override public void add(int doc) { buffer.array[buffer.length++] = doc; } + + @Override + public void add(IntsRef docs) { + System.arraycopy(docs.ints, docs.offset, buffer.array, buffer.length, docs.length); + buffer.length += docs.length; + } + + @Override + public void add(DocIdSetIterator iterator) throws IOException { + int docID; + while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + add(docID); + } + } } private final int maxDoc; diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java index 0efcc2ef4650..a90c79a8c808 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java @@ -24,6 +24,7 @@ import 
org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.MathUtil; /** @@ -146,6 +147,19 @@ public void visit(int docID) { count[0]++; } + @Override + public void visit(DocIdSetIterator iterator) throws IOException { + int docID; + while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + visit(docID); + } + } + + @Override + public void visit(IntsRef ref) { + count[0] += ref.length; + } + @Override public void visit(int docID, byte[] packedValue) { throw new AssertionError(); diff --git a/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java b/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java index 88dbf24e2d13..1d9079a203fd 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java @@ -130,13 +130,20 @@ public void testRandom() throws IOException { for (j = 0; j < array.length; ) { final int l = TestUtil.nextInt(random(), 1, array.length - j); DocIdSetBuilder.BulkAdder adder = null; - for (int k = 0, budget = 0; k < l; ++k) { - if (budget == 0 || rarely()) { - budget = TestUtil.nextInt(random(), 1, l - k + 5); - adder = builder.grow(budget); + if (usually()) { + for (int k = 0, budget = 0; k < l; ++k) { + if (budget == 0 || rarely()) { + budget = TestUtil.nextInt(random(), 1, l - k + 5); + adder = builder.grow(budget); + } + adder.add(array[j++]); + budget--; } - adder.add(array[j++]); - budget--; + } else { + IntsRef intsRef = new IntsRef(array, j, l); + adder = builder.grow(l); + adder.add(intsRef); + j += l; } } diff --git a/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInShapeIntersectVisitor.java b/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInShapeIntersectVisitor.java index 8883fef22409..9b990b08e475 100644 --- a/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInShapeIntersectVisitor.java +++ b/lucene/spatial3d/src/java/org/apache/lucene/spatial3d/PointInShapeIntersectVisitor.java @@ -27,6 +27,7 @@ import org.apache.lucene.spatial3d.geom.PlanetModel.DocValueEncoder; import org.apache.lucene.spatial3d.geom.XYZBounds; import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.NumericUtils; class PointInShapeIntersectVisitor implements IntersectVisitor { @@ -67,6 +68,11 @@ public void visit(DocIdSetIterator iterator) throws IOException { adder.add(iterator); } + @Override + public void visit(IntsRef ref) throws IOException { + adder.add(ref); + } + @Override public void visit(int docID, byte[] packedValue) { assert packedValue.length == 12; From 5c91f1593a3ca750577b21ba1c17fff2b2e9cf04 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 16 Jan 2025 18:53:09 +0100 Subject: [PATCH 30/88] Fix `TestFeatureField.testStoreTermVectors` failure. (#14146) The error message is a bit different depending on whether you append to an existing `IndexingChain.PerField` object or to a new one. 
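As an aside for anyone reproducing this check outside the Lucene test framework, the tolerant assertion the fix relies on is plain Hamcrest anyOf/containsString; a minimal self-contained sketch (class and method names invented for illustration) could look like this:

    import static org.hamcrest.MatcherAssert.assertThat;
    import static org.hamcrest.Matchers.anyOf;
    import static org.hamcrest.Matchers.containsString;

    /** Illustrative only: accept either wording of the rejection message. */
    final class ErrorMessageAssertions {
      static void assertTermVectorRejection(Exception exc) {
        // Passes when the message matches either phrasing, since the exact text depends on
        // which indexing-chain code path rejects the document.
        assertThat(
            exc.getMessage(),
            anyOf(containsString("store term vector"), containsString("storeTermVector")));
      }
    }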
--- .../test/org/apache/lucene/document/TestFeatureField.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java index 33918c4d8dc8..bd0696f66d16 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.document; +import static org.hamcrest.Matchers.anyOf; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; @@ -480,7 +481,9 @@ public void testStoreTermVectors() throws Exception { FeatureField invalid = new FeatureField("features", "pagerank", 1, false); doc.add(invalid); var exc = expectThrows(Exception.class, () -> writer.addDocument(doc)); - assertThat(exc.getMessage(), containsString("store term vector")); + assertThat( + exc.getMessage(), + anyOf(containsString("store term vector"), containsString("storeTermVector"))); writer.forceMerge(1); DirectoryReader reader = writer.getReader(); From 16da44a1bb714cc6c3c34ebd2a77ea86fac29286 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 16 Jan 2025 18:53:53 +0100 Subject: [PATCH 31/88] Fix `BitSetIterator` to correctly honor the contract of `DocIdSetIterator#intoBitSet`. (#14142) `BitSetIterator#intoBitSet` would currently fail if `upTo - offset` exceeds the length of the destination bit set. However, `DocIdSetIterator#intoBitSet` only requires matching docs to be set into the bit set, so having `upTo - offset` exceed the length of the dest bit set is legal as long as no bits are set beyond `offset + bitSet.length()`. --- .../apache/lucene/util/BitSetIterator.java | 13 ++-- .../tests/util/BaseDocIdSetTestCase.java | 68 +++++++++++++++++++ 2 files changed, 76 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java b/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java index ff74c107b13c..ba55573baf0e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java @@ -100,12 +100,15 @@ public long cost() { @Override public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { - upTo = Math.min(upTo, bits.length()); if (upTo > doc && bits instanceof FixedBitSet fixedBits) { - FixedBitSet.orRange(fixedBits, doc, bitSet, doc - offset, upTo - doc); - advance(upTo); // set the current doc - } else { - super.intoBitSet(upTo, bitSet, offset); + int actualUpto = Math.min(upTo, length); + // The destination bit set may be shorter than this bit set. This is only legal if all bits + // beyond offset + bitSet.length() are clear. If not, the below call to `super.intoBitSet` + // will throw an exception. 
+ actualUpto = (int) Math.min(actualUpto, offset + (long) bitSet.length()); + FixedBitSet.orRange(fixedBits, doc, bitSet, doc - offset, actualUpto - doc); + advance(actualUpto); // set the current doc } + super.intoBitSet(upTo, bitSet, offset); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/BaseDocIdSetTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/BaseDocIdSetTestCase.java index 2c1dfc72a31a..c74757b542ca 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/BaseDocIdSetTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/BaseDocIdSetTestCase.java @@ -24,6 +24,7 @@ import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; /** Base test class for {@link DocIdSet}s. */ public abstract class BaseDocIdSetTestCase extends LuceneTestCase { @@ -196,4 +197,71 @@ private long ramBytesUsed(DocIdSet set, int length) throws IOException { long bytes2 = RamUsageTester.ramUsed(dummy); return bytes1 - bytes2; } + + public void testIntoBitSet() throws IOException { + Random random = random(); + final int numBits = TestUtil.nextInt(random, 100, 1 << 20); + // test various random sets with various load factors + for (float percentSet : new float[] {0f, 0.0001f, random.nextFloat(), 0.9f, 1f}) { + final BitSet set = randomSet(numBits, percentSet); + final T copy = copyOf(set, numBits); + int from = TestUtil.nextInt(random(), 0, numBits - 1); + int to = TestUtil.nextInt(random(), from, numBits + 5); + FixedBitSet actual = new FixedBitSet(to - from); + DocIdSetIterator it1 = copy.iterator(); + if (it1 == null) { + continue; + } + int fromDoc = it1.advance(from); + // No docs to set + it1.intoBitSet(from, actual, from); + assertTrue(actual.scanIsEmpty()); + assertEquals(fromDoc, it1.docID()); + + // Now actually set some bits + it1.intoBitSet(to, actual, from); + FixedBitSet expected = new FixedBitSet(to - from); + DocIdSetIterator it2 = copy.iterator(); + for (int doc = it2.advance(from); doc < to; doc = it2.nextDoc()) { + expected.set(doc - from); + } + assertEquals(expected, actual); + // Check if docID() / nextDoc() return the same value after #intoBitSet has been called. 
+ assertEquals(it2.docID(), it1.docID()); + if (it2.docID() != DocIdSetIterator.NO_MORE_DOCS) { + assertEquals(it2.nextDoc(), it1.nextDoc()); + } + } + } + + public void testIntoBitSetBoundChecks() throws IOException { + final BitSet set = new BitSet(); + set.set(20); + set.set(42); + final T copy = copyOf(set, 256); + int from = TestUtil.nextInt(random(), 0, 20); + int to = TestUtil.nextInt(random(), 43, 256); + int offset = TestUtil.nextInt(random(), 0, from); + FixedBitSet dest1 = new FixedBitSet(42 - offset + 1); + DocIdSetIterator it1 = copy.iterator(); + it1.advance(from); + // This call is legal, since all "set" bits are in the range + it1.intoBitSet(to, dest1, offset); + for (int i = 0; i < dest1.length(); ++i) { + assertEquals(offset + i == 20 || offset + i == 42, dest1.get(i)); + } + + FixedBitSet dest2 = new FixedBitSet(42 - offset); + DocIdSetIterator it2 = copy.iterator(); + it2.advance(from); + // This call is not legal, since there is one bit that is set beyond the end of the target bit + // set + expectThrows(Throwable.class, () -> it2.intoBitSet(to, dest2, offset)); + + FixedBitSet dest3 = new FixedBitSet(42 - offset + 1); + DocIdSetIterator it3 = copy.iterator(); + it3.advance(from); + // This call is not legal, since offset is greater than the current doc + expectThrows(Throwable.class, () -> it3.intoBitSet(to, dest3, 21)); + } } From cad76ccfd2d7e38e3c25e1d454ff8182571ca9ab Mon Sep 17 00:00:00 2001 From: Viliam Durina Date: Thu, 16 Jan 2025 22:20:17 +0100 Subject: [PATCH 32/88] Grammar fixes in comments (#14100) --- .../Lucene99ScalarQuantizedVectorsWriter.java | 2 +- .../lucene/index/CompositeReaderContext.java | 4 ++-- .../apache/lucene/index/DirectoryReader.java | 7 +++---- .../apache/lucene/index/DocumentsWriter.java | 2 +- .../index/DocumentsWriterFlushControl.java | 2 +- .../lucene/index/DocumentsWriterPerThread.java | 2 +- .../lucene/index/IndexReaderContext.java | 2 +- .../org/apache/lucene/index/IndexWriter.java | 4 ++-- .../org/apache/lucene/index/MergePolicy.java | 4 ++-- .../org/apache/lucene/index/ReaderPool.java | 2 +- .../org/apache/lucene/index/SegmentReader.java | 2 +- .../SoftDeletesDirectoryReaderWrapper.java | 8 ++++---- .../lucene/index/StoredFieldVisitor.java | 2 +- .../apache/lucene/search/MultiCollector.java | 2 +- .../lucene/search/MultiCollectorManager.java | 18 +++++++++--------- .../apache/lucene/search/ReferenceManager.java | 4 ++-- .../org/apache/lucene/store/FSDirectory.java | 2 +- .../apache/lucene/store/FilterIndexInput.java | 2 +- .../org/apache/lucene/store/IOContext.java | 4 +--- .../lucene/store/NRTCachingDirectory.java | 4 ++-- .../lucene/store/RateLimitedIndexOutput.java | 2 +- .../vectorization/PanamaVectorUtilSupport.java | 6 +++--- .../lucene/misc/store/DirectIODirectory.java | 6 +++--- 23 files changed, 45 insertions(+), 48 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java index 1a30b5271cd7..39f3a81983b4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java @@ -223,7 +223,7 @@ public FlatFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOExceptio public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { rawVectorDelegate.mergeOneField(fieldInfo, mergeState); // 
Since we know we will not be searching for additional indexing, we can just write the - // the vectors directly to the new segment. + // vectors directly to the new segment. // No need to use temporary file as we don't have to re-open for reading if (fieldInfo.getVectorEncoding().equals(VectorEncoding.FLOAT32)) { ScalarQuantizer mergedQuantizationState = diff --git a/lucene/core/src/java/org/apache/lucene/index/CompositeReaderContext.java b/lucene/core/src/java/org/apache/lucene/index/CompositeReaderContext.java index f64a460325bc..0004df794e5c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CompositeReaderContext.java +++ b/lucene/core/src/java/org/apache/lucene/index/CompositeReaderContext.java @@ -32,8 +32,8 @@ static CompositeReaderContext create(CompositeReader reader) { } /** - * Creates a {@link CompositeReaderContext} for intermediate readers that aren't not top-level - * readers in the current context + * Creates a {@link CompositeReaderContext} for intermediate readers that aren't top-level readers + * in the current context */ CompositeReaderContext( CompositeReaderContext parent, diff --git a/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java b/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java index 7571a7f0d074..8f56ae49d3e6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/DirectoryReader.java @@ -124,7 +124,7 @@ public static DirectoryReader open(final IndexCommit commit) throws IOException /** * Expert: returns an IndexReader reading the index on the given {@link IndexCommit}. This method - * allows to open indices that were created wih a Lucene version older than N-1 provided that all + * allows to open indices that were created with a Lucene version older than N-1 provided that all * codecs for this index are available in the classpath and the segment file format used was * created with Lucene 7 or newer. Users of this API must be aware that Lucene doesn't guarantee * semantic compatibility for indices created with versions older than N-1. All backwards @@ -150,8 +150,7 @@ public static DirectoryReader open( /** * If the index has changed since the provided reader was opened, open and return a new reader; * else, return null. The new reader, if not null, will be the same type of reader as the previous - * one, ie an NRT reader will open a new NRT reader, a MultiReader will open a new MultiReader, - * etc. + * one, ie an NRT reader will open a new NRT reader etc. * *

    This method is typically far less costly than opening a fully new DirectoryReader * as it shares resources (for example sub-readers) with the provided @@ -192,7 +191,7 @@ public static DirectoryReader openIfChanged(DirectoryReader oldReader, IndexComm * never returns null). * *

    This provides "near real-time" searching, in that changes made during an {@link IndexWriter} - * session can be quickly made available for searching without closing the writer nor calling + * session can be quickly made available for searching without closing the writer or calling * {@link IndexWriter#commit}. * *

    It's near real-time because there is no hard guarantee on how quickly you can get a diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java index e32c8b20c047..71797257ee70 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java @@ -486,7 +486,7 @@ private void doFlush(DocumentsWriterPerThread flushingDWPT) throws IOException { * flush 'B' starts and freezes all deletes occurred since 'A' has * started. if 'B' finishes before 'A' we need to wait until 'A' is done * otherwise the deletes frozen by 'B' are not applied to 'A' and we - * might miss to deletes documents in 'A'. + * might miss to delete documents in 'A'. */ try { assert assertTicketQueueModification(flushingDWPT.deleteQueue); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java index 170966e8ae49..ed3b9d0698d1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java @@ -216,7 +216,7 @@ && delta < ramBufferGranularity()) { // we need to commit this under lock but calculate it outside of the lock to minimize the time // this lock is held // per document. The reason we update this under lock is that we mark DWPTs as pending without - // acquiring it's + // acquiring its // lock in #setFlushPending and this also reads the committed bytes and modifies the // flush/activeBytes. // In the future we can clean this up to be more intuitive. diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index fd6ed22bd4dd..13fe8e62fe17 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -736,7 +736,7 @@ long getLastCommittedBytesUsed() { } /** - * Commits the current {@link #ramBytesUsed()} and stores it's value for later reuse. The last + * Commits the current {@link #ramBytesUsed()} and stores its value for later reuse. The last * committed bytes used can be retrieved via {@link #getLastCommittedBytesUsed()} */ void commitLastBytesUsed(long delta) { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexReaderContext.java b/lucene/core/src/java/org/apache/lucene/index/IndexReaderContext.java index 167900575aff..3c79a5e9c7cb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexReaderContext.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexReaderContext.java @@ -19,7 +19,7 @@ import java.util.List; /** - * A struct like class that represents a hierarchical relationship between {@link IndexReader} + * A struct-like class that represents a hierarchical relationship between {@link IndexReader} * instances. 
*/ public abstract sealed class IndexReaderContext permits CompositeReaderContext, LeafReaderContext { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index ad11476e7345..7c907b31b5dc 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -2979,7 +2979,7 @@ private List acquireWriteLocks(Directory... dirs) throws IOException { * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @throws IllegalArgumentException if addIndexes would cause the index to exceed {@link - * #MAX_DOCS}, or if the indoming index sort does not match this index's index sort + * #MAX_DOCS}, or if the incoming index sort does not match this index's index sort */ public long addIndexes(Directory... dirs) throws IOException { ensureOpen(); @@ -6029,7 +6029,7 @@ private void processEvents(boolean triggerMerge) throws IOException { /** * Interface for internal atomic events. See {@link DocumentsWriter} for details. Events are * executed concurrently and no order is guaranteed. Each event should only rely on the - * serializeability within its process method. All actions that must happen before or after a + * serializability within its process method. All actions that must happen before or after a * certain action must be encoded inside the {@link #process(IndexWriter)} method. */ @FunctionalInterface diff --git a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java index d66f5648c03d..cbea98daf58f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java @@ -756,7 +756,7 @@ public boolean useCompoundFile( /** * Return the byte size of the provided {@link SegmentCommitInfo}, prorated by percentage of - * non-deleted documents is set. + * non-deleted documents. */ protected long size(SegmentCommitInfo info, MergeContext mergeContext) throws IOException { long byteSize = info.sizeInBytes(); @@ -838,7 +838,7 @@ public void setMaxCFSSegmentSizeMB(double v) { } /** - * Returns true if the segment represented by the given CodecReader should be keep even if it's + * Returns true if the segment represented by the given CodecReader should be kept even if it's * fully deleted. This is useful for testing of for instance if the merge policy implements * retention policies for soft deletes. */ diff --git a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java index 1bac886c7b4c..b7ca6634efbf 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReaderPool.java @@ -273,7 +273,7 @@ boolean writeDocValuesUpdatesForMerge(List infos) throws IOEx } /** - * Returns a list of all currently maintained ReadersAndUpdates sorted by it's ram consumption + * Returns a list of all currently maintained ReadersAndUpdates sorted by their ram consumption * largest to smallest. This list can also contain readers that don't consume any ram at this * point i.e. don't have any updates buffered. 
*/ diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java index 979d4a7712f5..12f48a1d98f8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java @@ -48,7 +48,7 @@ public final class SegmentReader extends CodecReader { private final SegmentCommitInfo si; // this is the original SI that IW uses internally but it's mutated behind the scenes - // and we don't want this SI to be used for anything. Yet, IW needs this to do maintainance + // and we don't want this SI to be used for anything. Yet, IW needs this to do maintenance // and lookup pooled readers etc. private final SegmentCommitInfo originalSi; private final LeafMetaData metaData; diff --git a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java index 1515c8469c17..bbe493d88d31 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SoftDeletesDirectoryReaderWrapper.java @@ -32,9 +32,9 @@ import org.apache.lucene.util.FixedBitSet; /** - * This reader filters out documents that have a doc values value in the given field and treat these - * documents as soft deleted. Hard deleted documents will also be filtered out in the life docs of - * this reader. + * This reader filters out documents that have a doc-values value in the given field and treats + * these documents as soft-deleted. Hard deleted documents will also be filtered out in the live + * docs of this reader. * * @see IndexWriterConfig#setSoftDeletesField(String) * @see IndexWriter#softUpdateDocument(Term, Iterable, Field...) @@ -68,7 +68,7 @@ private SoftDeletesDirectoryReaderWrapper(DirectoryReader in, SoftDeletesSubRead protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) throws IOException { Map readerCache = new HashMap<>(); for (LeafReader reader : getSequentialSubReaders()) { - // we try to reuse the life docs instances here if the reader cache key didn't change + // we try to reuse the live docs instances here if the reader cache key didn't change if (reader instanceof SoftDeletesFilterLeafReader && reader.getReaderCacheHelper() != null) { readerCache.put( ((SoftDeletesFilterLeafReader) reader).reader.getReaderCacheHelper().getKey(), reader); diff --git a/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java b/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java index 2457f392d112..95dbf11b7209 100644 --- a/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java +++ b/lucene/core/src/java/org/apache/lucene/index/StoredFieldVisitor.java @@ -63,7 +63,7 @@ public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {} /** Process a string field. */ public void stringField(FieldInfo fieldInfo, String value) throws IOException {} - /** Process a int numeric field. */ + /** Process an int numeric field. */ public void intField(FieldInfo fieldInfo, int value) throws IOException {} /** Process a long numeric field. 
*/ diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiCollector.java b/lucene/core/src/java/org/apache/lucene/search/MultiCollector.java index c5372f3170a4..c08f8cdee7e4 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiCollector.java @@ -25,7 +25,7 @@ /** * A {@link Collector} which allows running a search with several {@link Collector}s. It offers a * static {@link #wrap} method which accepts a list of collectors and wraps them with {@link - * MultiCollector}, while filtering out the null null ones. + * MultiCollector}, while filtering out the null ones. * *

    NOTE:When mixing collectors that want to skip low-scoring hits ({@link * ScoreMode#TOP_SCORES}) with ones that require to see all hits, such as mixing {@link diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiCollectorManager.java b/lucene/core/src/java/org/apache/lucene/search/MultiCollectorManager.java index bbdfa56da156..01fa859c753d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiCollectorManager.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiCollectorManager.java @@ -22,8 +22,8 @@ import java.util.List; /** - * A {@link CollectorManager} implements which wrap a set of {@link CollectorManager} as {@link - * MultiCollector} acts for {@link Collector}. + * A composite {@link CollectorManager} which wraps a set of {@link CollectorManager} instances, + * akin to how {@link MultiCollector} wraps {@link Collector} instances. */ public class MultiCollectorManager implements CollectorManager { @@ -56,21 +56,21 @@ public Collector newCollector() throws IOException { } @Override - public Object[] reduce(Collection reducableCollectors) throws IOException { - final int size = reducableCollectors.size(); + public Object[] reduce(Collection reducibleCollectors) throws IOException { + final int size = reducibleCollectors.size(); final Object[] results = new Object[collectorManagers.length]; for (int i = 0; i < collectorManagers.length; i++) { - final List reducableCollector = new ArrayList<>(size); - for (Collector collector : reducableCollectors) { + final List reducibleCollector = new ArrayList<>(size); + for (Collector collector : reducibleCollectors) { // MultiCollector will not actually wrap the collector if only one is provided, so we // check the instance type here: if (collector instanceof MultiCollector) { - reducableCollector.add(((MultiCollector) collector).getCollectors()[i]); + reducibleCollector.add(((MultiCollector) collector).getCollectors()[i]); } else { - reducableCollector.add(collector); + reducibleCollector.add(collector); } } - results[i] = collectorManagers[i].reduce(reducableCollector); + results[i] = collectorManagers[i].reduce(reducibleCollector); } return results; } diff --git a/lucene/core/src/java/org/apache/lucene/search/ReferenceManager.java b/lucene/core/src/java/org/apache/lucene/search/ReferenceManager.java index 699da549d0e3..e5a669d85e07 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ReferenceManager.java +++ b/lucene/core/src/java/org/apache/lucene/search/ReferenceManager.java @@ -101,8 +101,8 @@ public final G acquire() throws IOException { if (getRefCount(ref) == 0 && current == ref) { assert ref != null; /* if we can't increment the reader but we are - still the current reference the RM is in a - illegal states since we can't make any progress + still the current reference the RM is in an + illegal state since we can't make any progress anymore. The reference is closed but the RM still holds on to it as the actual instance. This can only happen if somebody outside of the RM diff --git a/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java b/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java index 0a49cba05e49..5039c779097a 100644 --- a/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java +++ b/lucene/core/src/java/org/apache/lucene/store/FSDirectory.java @@ -349,7 +349,7 @@ private void privateDeleteFile(String name, boolean isPendingDelete) throws IOEx // a WindowsFSDirectory ... 
// LUCENE-6684: we suppress this check for Windows, since a file could be in a confusing // "pending delete" state, failing the first - // delete attempt with access denied and then apparently falsely failing here when we try ot + // delete attempt with access denied and then apparently falsely failing here when we try to // delete it again, with NSFE/FNFE } else { throw e; diff --git a/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java index 9e60a51790f9..933701b3c3de 100644 --- a/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java @@ -21,7 +21,7 @@ import org.apache.lucene.internal.tests.TestSecrets; /** - * IndexInput implementation that delegates calls to another directory. This class can be used to + * IndexInput implementation that delegates calls to another IndexInput. This class can be used to * add limitations on top of an existing {@link IndexInput} implementation or to add additional * sanity checks for tests. However, if you plan to write your own {@link IndexInput} * implementation, you should consider extending directly {@link IndexInput} or {@link DataInput} diff --git a/lucene/core/src/java/org/apache/lucene/store/IOContext.java b/lucene/core/src/java/org/apache/lucene/store/IOContext.java index 91f3822dbc13..5f341609748f 100644 --- a/lucene/core/src/java/org/apache/lucene/store/IOContext.java +++ b/lucene/core/src/java/org/apache/lucene/store/IOContext.java @@ -34,9 +34,7 @@ public record IOContext( Context context, MergeInfo mergeInfo, FlushInfo flushInfo, ReadAdvice readAdvice) { - /** - * Context is a enumerator which specifies the context in which the Directory is being used for. - */ + /** Context is an enumerator which specifies the context in which the Directory is being used. */ public enum Context { /** Context for reads and writes that are associated with a merge. */ MERGE, diff --git a/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java b/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java index 16d6aa22ce53..35a300c1763a 100644 --- a/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java +++ b/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java @@ -56,8 +56,8 @@ * * *

    This will cache all newly flushed segments, all merges whose expected segment size is {@code - * <= 5 MB}, unless the net cached bytes exceeds 60 MB at which point all writes will not be cached - * (until the net bytes falls below 60 MB). + * <= 5 MB}, unless the net cached bytes exceed 60 MB at which point all writes will not be cached + * (until the net bytes fall below 60 MB). * * @lucene.experimental */ diff --git a/lucene/core/src/java/org/apache/lucene/store/RateLimitedIndexOutput.java b/lucene/core/src/java/org/apache/lucene/store/RateLimitedIndexOutput.java index bfac505b8b09..86c7ca3885f3 100644 --- a/lucene/core/src/java/org/apache/lucene/store/RateLimitedIndexOutput.java +++ b/lucene/core/src/java/org/apache/lucene/store/RateLimitedIndexOutput.java @@ -31,7 +31,7 @@ public final class RateLimitedIndexOutput extends FilterIndexOutput { private long bytesSinceLastPause; /** - * Cached here not not always have to call RateLimiter#getMinPauseCheckBytes() which does volatile + * Cached here to not always have to call RateLimiter#getMinPauseCheckBytes() which does volatile * read. */ private long currentMinPauseCheckBytes; diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java index 1369aa5e3f40..6b2d45d3e294 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java @@ -98,7 +98,7 @@ public float dotProduct(float[] a, float[] b) { int i = 0; float res = 0; - // if the array size is large (> 2x platform vector size), its worth the overhead to vectorize + // if the array size is large (> 2x platform vector size), it's worth the overhead to vectorize if (a.length > 2 * FLOAT_SPECIES.length()) { i += FLOAT_SPECIES.loopBound(a.length); res += dotProductBody(a, b, i); @@ -161,7 +161,7 @@ public float cosine(float[] a, float[] b) { float norm1 = 0; float norm2 = 0; - // if the array size is large (> 2x platform vector size), its worth the overhead to vectorize + // if the array size is large (> 2x platform vector size), it's worth the overhead to vectorize if (a.length > 2 * FLOAT_SPECIES.length()) { i += FLOAT_SPECIES.loopBound(a.length); float[] ret = cosineBody(a, b, i); @@ -226,7 +226,7 @@ public float squareDistance(float[] a, float[] b) { int i = 0; float res = 0; - // if the array size is large (> 2x platform vector size), its worth the overhead to vectorize + // if the array size is large (> 2x platform vector size), it's worth the overhead to vectorize if (a.length > 2 * FLOAT_SPECIES.length()) { i += FLOAT_SPECIES.loopBound(a.length); res += squareDistanceBody(a, b, i); diff --git a/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java b/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java index 8b5f4fd76b77..015d0ee13136 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java @@ -148,7 +148,7 @@ protected void ensureOpen() throws AlreadyClosedException { /** * Determines if direct IO should be used for a file. By default this tests if it is a merge - * context and if the merge or file length extends the minimum size (see {@link + * context and if the merge or file length exceeds the minimum size (see {@link * #DEFAULT_MIN_BYTES_DIRECT}). 
Subclasses may override method to enforce direct IO for specific * file types. * @@ -213,8 +213,8 @@ private static final class DirectIOIndexOutput extends IndexOutput { * bypassing OS buffer * * @throws UnsupportedOperationException if the JDK does not support Direct I/O - * @throws IOException if the operating system or filesystem does not support support Direct I/O - * or a sufficient equivalent. + * @throws IOException if the operating system or filesystem does not support Direct I/O or a + * sufficient equivalent. */ public DirectIOIndexOutput(Path path, String name, int blockSize, int bufferSize) throws IOException { From a4434ca93c8efd2c19729d2d73195b5d570cb102 Mon Sep 17 00:00:00 2001 From: Mikhail Khludnev Date: Fri, 17 Jan 2025 14:36:58 +0300 Subject: [PATCH 33/88] javadocs: fix invalid refs in `queryparsers` #14086 (#14087) --- .../queryparser/flexible/package-info.java | 80 +----------------- .../flexible/standard/package-info.java | 81 ++++++++++++++++++- lucene/queryparser/src/java/overview.html | 6 +- 3 files changed, 81 insertions(+), 86 deletions(-) diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/package-info.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/package-info.java index 4d20996e41bd..f3c1ded1f1ff 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/package-info.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/package-info.java @@ -15,83 +15,5 @@ * limitations under the License. */ -/** - * Flexible query parser is a modular, extensible framework for implementing Lucene query parsers. - * In the flexible query parser model, query parsing takes three steps: syntax parsing, processing - * (query semantics) and building (conversion to a Lucene {@link org.apache.lucene.search.Query}). - * - *

    The flexible query parser module provides not just the framework but also the {@linkplain - * org.apache.lucene.queryparser.flexible.standard.StandardQueryParser} - the default implementation - * of a fully fledged query parser that supports most of the classic query parser's syntax but also - * adds support for interval functions, min-should-match operator on Boolean groups and many hooks - * for customization of how the parser behaves at runtime. - * - *

    The flexible query parser is divided in two packages: - * - *

      - *
    • {@link org.apache.lucene.queryparser.flexible.core}: contains the query parser API classes, - * which should be extended by custom query parser implementations. - *
    • {@link org.apache.lucene.queryparser.flexible.standard}: contains an example Lucene query - * parser implementation built on top of the flexible query parser API. - *
    - * - *

    Features

    - * - *
      - *
    1. full support for Boolean expressions, including groups - *
    2. {@linkplain org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser syntax parsers} - * - support for arbitrary syntax parsers, that can be converted into {@link - * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} trees. - *
    3. {@linkplain org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessor query - * node processors} - optimize, validate, rewrite the {@link - * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} trees - *
    4. {@linkplain - * org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessorPipeline processor - * pipelines} - select your favorite query processors and build a pipeline to implement the - * features you need. - *
    5. {@linkplain org.apache.lucene.queryparser.flexible.core.config.QueryConfigHandler query - * configuration handlers} - *
    6. {@linkplain org.apache.lucene.queryparser.flexible.core.builders.QueryBuilder query - * builders} - convert {@link org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} - * trees into Lucene {@link org.apache.lucene.search.Query} instances. - *
    - * - *

    Design

    - * - *

    The flexible query parser was designed to have a very generic architecture, so that it can be - * easily used for different products with varying query syntax needs. - * - *

    The query parser has three layers and its core is what we call the {@linkplain - * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode query node tree}. It is a tree of - * objects that represent the syntax of the original query, for example, for 'a AND b' the tree - * could look like this: - * - *

    - *       AND
    - *      /   \
    - *     A     B
    - * 
    - * - *

    The three flexible query parser layers are: - * - *

    - *
    {@link org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser} - *
    This layer is the text parsing layer which simply transforms the query text string into a - * {@link org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} tree. Every text parser - * must implement the interface {@link - * org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser}. The default - * implementation is {@link - * org.apache.lucene.queryparser.flexible.standard.parser.StandardSyntaxParser}. - *
    {@link org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessor} - *
    The query node processor does most of the work: it contains a chain of {@linkplain - * org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessor query node - * processors}. Each processor can walk the tree and modify nodes or even the tree's - * structure. This allows for query optimization before the node tree is converted to an - * actual query. - *
    {@link org.apache.lucene.queryparser.flexible.core.builders.QueryBuilder} - *
    The third layer is a configurable map of builders, which map {@linkplain - * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode query nodes} to their adapters - * that convert each node into a {@link org.apache.lucene.search.Query}. - *
    - */ +/** */ package org.apache.lucene.queryparser.flexible; diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/package-info.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/package-info.java index 569df7a029cf..9d02e8aff020 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/package-info.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/package-info.java @@ -22,10 +22,83 @@ * operations. In the new query parser structure, the parsing was divided in 3 steps: parsing * (syntax), processing (semantic) and building. * - *

    The classes contained in the package org.apache.lucene.queryParser.standard are used to - * reproduce the same behavior as the old query parser. + *

    Flexible query parser is a modular, extensible framework for implementing Lucene query + * parsers. In the flexible query parser model, query parsing takes three steps: syntax parsing, + * processing (query semantics) and building (conversion to a Lucene {@link + * org.apache.lucene.search.Query}). * - *

    Check {@link org.apache.lucene.queryparser.flexible.standard.StandardQueryParser} to quick - * start using the Lucene query parser. + *

    The flexible query parser module provides not just the framework but also the {@linkplain + * org.apache.lucene.queryparser.flexible.standard.StandardQueryParser} - the default implementation + * of a fully fledged query parser that supports most of the classic query parser's syntax but also + * adds support for interval functions, min-should-match operator on Boolean groups and many hooks + * for customization of how the parser behaves at runtime. + * + *

    The flexible query parser is divided in two packages: + * + *

      + *
    • {@link org.apache.lucene.queryparser.flexible.core}: contains the query parser API classes, + * which should be extended by custom query parser implementations. + *
    • {@link org.apache.lucene.queryparser.flexible.standard}: contains an example Lucene query + * parser implementation built on top of the flexible query parser API. + *
    + * + *

    Features

    + * + *
      + *
    1. full support for Boolean expressions, including groups + *
    2. {@linkplain org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser syntax parsers} + * - support for arbitrary syntax parsers, that can be converted into {@link + * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} trees. + *
    3. {@linkplain org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessor query + * node processors} - optimize, validate, rewrite the {@link + * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} trees + *
    4. {@linkplain + * org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessorPipeline processor + * pipelines} - select your favorite query processors and build a pipeline to implement the + * features you need. + *
    5. {@linkplain org.apache.lucene.queryparser.flexible.core.config.QueryConfigHandler query + * configuration handlers} + *
    6. {@linkplain org.apache.lucene.queryparser.flexible.core.builders.QueryBuilder query + * builders} - convert {@link org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} + * trees into Lucene {@link org.apache.lucene.search.Query} instances. + *
    + * + *

    Design

    + * + *

    The flexible query parser was designed to have a very generic architecture, so that it can be + * easily used for different products with varying query syntax needs. + * + *

    The query parser has three layers and its core is what we call the {@linkplain + * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode query node tree}. It is a tree of + * objects that represent the syntax of the original query, for example, for 'a AND b' the tree + * could look like this: + * + *

    + *       AND
    + *      /   \
    + *     A     B
    + * 
    + * + *

    The three flexible query parser layers are: + * + *

    + *
    {@link org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser} + *
    This layer is the text parsing layer which simply transforms the query text string into a + * {@link org.apache.lucene.queryparser.flexible.core.nodes.QueryNode} tree. Every text parser + * must implement the interface {@link + * org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser}. The default + * implementation is {@link + * org.apache.lucene.queryparser.flexible.standard.parser.StandardSyntaxParser}. + *
    {@link org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessor} + *
    The query node processor does most of the work: it contains a chain of {@linkplain + * org.apache.lucene.queryparser.flexible.core.processors.QueryNodeProcessor query node + * processors}. Each processor can walk the tree and modify nodes or even the tree's + * structure. This allows for query optimization before the node tree is converted to an + * actual query. + *
    {@link org.apache.lucene.queryparser.flexible.core.builders.QueryBuilder} + *
    The third layer is a configurable map of builders, which map {@linkplain + * org.apache.lucene.queryparser.flexible.core.nodes.QueryNode query nodes} to their adapters + * that convert each node into a {@link org.apache.lucene.search.Query}. + *
    */ package org.apache.lucene.queryparser.flexible.standard; diff --git a/lucene/queryparser/src/java/overview.html b/lucene/queryparser/src/java/overview.html index 2b6f8a446afb..a7c579dd836f 100644 --- a/lucene/queryparser/src/java/overview.html +++ b/lucene/queryparser/src/java/overview.html @@ -27,16 +27,16 @@

    Apache Lucene QueryParsers.

    This module provides a number of query parsers:

      -
    • {@linkplain org.apache.lucene.queryparser.flexible flexible query parser} +
    • {@linkplain org.apache.lucene.queryparser.flexible.standard flexible query parser}
    • {@linkplain org.apache.lucene.queryparser.classic classic query parser}
    • {@linkplain org.apache.lucene.queryparser.complexPhrase complex phrase query parser}
    • {@linkplain org.apache.lucene.queryparser.ext extendable query parser} -
    • {@linkplain org.apache.lucene.queryparser.surround surround query parser (span queries)} +
    • {@linkplain org.apache.lucene.queryparser.surround.parser surround query parser (span queries)}
    • {@linkplain org.apache.lucene.queryparser.xml query parser building Query objects from XML}

    - If you're new to query parsers, the {@linkplain org.apache.lucene.queryparser.flexible flexible query parser}'s + If you're new to query parsers, the {@linkplain org.apache.lucene.queryparser.flexible.standard flexible query parser}'s {@link org.apache.lucene.queryparser.flexible.standard.StandardQueryParser} is probably a good place to start. From 4bca45fb3b237790e32f9e8e7b3c2f0108c5bd84 Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Fri, 17 Jan 2025 21:03:06 +0530 Subject: [PATCH 34/88] Adding Javadocs to some public methods --- lucene/licenses/cuvs-java-25.02.jar.sha1 | 2 +- .../sandbox/vectorsearch/CagraFieldVectorsWriter.java | 3 +++ .../apache/lucene/sandbox/vectorsearch/CuVSCodec.java | 3 +++ .../apache/lucene/sandbox/vectorsearch/CuVSIndex.java | 3 +++ .../sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java | 3 +++ .../lucene/sandbox/vectorsearch/CuVSSegmentFile.java | 3 +++ .../lucene/sandbox/vectorsearch/CuVSVectorsFormat.java | 3 +++ .../lucene/sandbox/vectorsearch/CuVSVectorsReader.java | 3 +++ .../lucene/sandbox/vectorsearch/CuVSVectorsWriter.java | 9 +++++++++ .../sandbox/vectorsearch/PerLeafCuVSKnnCollector.java | 3 +++ .../lucene/sandbox/vectorsearch/SegmentInputStream.java | 3 +++ .../org/apache/lucene/sandbox/vectorsearch/Util.java | 3 +++ .../apache/lucene/sandbox/vectorsearch/package-info.java | 4 ++++ 13 files changed, 44 insertions(+), 1 deletion(-) diff --git a/lucene/licenses/cuvs-java-25.02.jar.sha1 b/lucene/licenses/cuvs-java-25.02.jar.sha1 index e399aed842a5..42b4dae43805 100644 --- a/lucene/licenses/cuvs-java-25.02.jar.sha1 +++ b/lucene/licenses/cuvs-java-25.02.jar.sha1 @@ -1 +1 @@ -280c6f97d99a8d32500a0c0891db1ccdc49bc17b +870f2aed1a4633489cc9c3d33128683e668a0f30 diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java index df8f83966dc3..6940b9bfeea6 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -22,6 +22,9 @@ import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.index.FieldInfo; +/** + * CuVS based fields writer + */ public class CagraFieldVectorsWriter extends KnnFieldVectorsWriter { public final String fieldName; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java index 315923d1eeb2..1e3c85d746ef 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -24,6 +24,9 @@ import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; +/** + * CuVS based codec for GPU based vector search + */ public class CuVSCodec extends FilterCodec { public CuVSCodec() { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index 98a2eb9739ac..6d2a4e281911 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -21,6 +21,9 @@ import java.util.List; import java.util.Objects; +/** + * This class 
holds references to the actual CuVS Index (Cagra, Brute force, etc.) + */ public class CuVSIndex { private final CagraIndex cagraIndex; private final BruteForceIndex bruteforceIndex; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java index e4df14208f97..e4ce49fb84f7 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -24,6 +24,9 @@ import org.apache.lucene.search.knn.KnnCollectorManager; import org.apache.lucene.util.Bits; +/** + * Query for CuVS + */ public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { private final int iTopK; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java index 7b850daa6662..e6be4726f16e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java @@ -26,6 +26,9 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; +/** + * Methods to deal with a CuVS composite file inside a segment + */ public class CuVSSegmentFile implements AutoCloseable { private final ZipOutputStream zos; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index e2b5bc2169f5..e3928a31b050 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -24,6 +24,9 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; +/** + * CuVS based KnnVectorsFormat for GPU acceleration + */ public class CuVSVectorsFormat extends KnnVectorsFormat { public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index d7e8a5f19b08..b41e5c08f177 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -51,6 +51,9 @@ import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; +/** + * KnnVectorsReader instance associated with CuVS format + */ public class CuVSVectorsReader extends KnnVectorsReader { // protected Logger log = Logger.getLogger(getClass().getName()); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index d5c155ca7212..bb40b7119a0e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -42,6 +42,9 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.SuppressForbidden; +/** + * KnnVectorsWriter for CuVS, responsible for merge and 
flush of vectors into GPU + */ public class CuVSVectorsWriter extends KnnVectorsWriter { // protected Logger log = Logger.getLogger(getClass().getName()); @@ -60,6 +63,9 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { private MergeStrategy mergeStrategy; private CuVSResources resources; + /** + * Merge strategy used for CuVS + */ public enum MergeStrategy { TRIVIAL_MERGE, NON_TRIVIAL_MERGE @@ -365,6 +371,9 @@ public void finish() throws IOException { } } + /** + * OutputStream for writing into an IndexOutput + */ public class SegmentOutputStream extends OutputStream { IndexOutput out; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java index 3c96aa37325b..a1473c4acf20 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -23,6 +23,9 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; +/** + * KnnCollector for CuVS + */ public class PerLeafCuVSKnnCollector implements KnnCollector { public List scoreDocs; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java index 787d7c81cc61..47c6d3c3cedf 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java @@ -20,6 +20,9 @@ import java.io.InputStream; import org.apache.lucene.store.IndexInput; +/** + * InputStream semantics for reading from an IndexInput + */ public class SegmentInputStream extends InputStream { /** */ diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java index 1ffb75037609..dfe60b29ea27 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java @@ -25,6 +25,9 @@ import java.util.zip.ZipInputStream; import org.apache.commons.lang3.SerializationUtils; +/** + * Some Utils used in CuVS integration + */ public class Util { public static ByteArrayOutputStream getZipEntryBAOS( diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java index ce9cd8cc52d2..a11c94e7224b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java @@ -14,4 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +/** + * CuVS based fast vector search + */ package org.apache.lucene.sandbox.vectorsearch; From dce24c8621748d51954c4845eab7626dae166db9 Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Sat, 18 Jan 2025 11:41:25 +0100 Subject: [PATCH 35/88] Fix negative cost in some IntersectVisitor implementations after #14138 (#14150) Introduced in #14138, we need to prevent negative scores when subtracting the added hits in LatLonPointDistanceQuery, SpatialQuery and PointRangeQuery. 
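A minimal sketch of the clamping idea behind this fix. The long[] holder below mirrors the cost[0] pattern used by the IntersectVisitor implementations changed in the diff that follows, but the class and method names are illustrative, not Lucene API.

class CostClampSketch {
  // Naive update: subtracting the cleared hits can drive the estimate below zero
  // when more ids are removed than the remaining cost accounts for.
  static void subtractUnclamped(long[] cost, int clearedCount) {
    cost[0] -= clearedCount;
  }

  // Clamped update, matching the patch: cost[0] = Math.max(0, cost[0] - ref.length).
  static void subtractClamped(long[] cost, int clearedCount) {
    cost[0] = Math.max(0, cost[0] - clearedCount);
  }

  public static void main(String[] args) {
    long[] cost = {3};
    subtractUnclamped(cost, 5);
    System.out.println(cost[0]); // -2: an invalid, negative cost estimate
    cost[0] = 3;
    subtractClamped(cost, 5);
    System.out.println(cost[0]); // 0: never negative
  }
}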
--- .../org/apache/lucene/document/LatLonPointDistanceQuery.java | 2 +- .../core/src/java/org/apache/lucene/document/SpatialQuery.java | 2 +- .../core/src/java/org/apache/lucene/search/PointRangeQuery.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java b/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java index 9cebb8e73014..4e816ffa6259 100644 --- a/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/LatLonPointDistanceQuery.java @@ -280,7 +280,7 @@ public void visit(IntsRef ref) { for (int i = 0; i < ref.length; i++) { result.clear(ref.ints[ref.offset + i]); } - cost[0] = -ref.length; + cost[0] = Math.max(0, cost[0] - ref.length); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java b/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java index cc233f89948d..4caf06526869 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/SpatialQuery.java @@ -677,7 +677,7 @@ public void visit(IntsRef ref) { for (int i = 0; i < ref.length; i++) { result.clear(ref.ints[ref.offset + i]); } - cost[0] -= ref.length; + cost[0] = Math.max(0, cost[0] - ref.length); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java index e5d956e8d1ee..97ae34713f86 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java @@ -233,7 +233,7 @@ public void visit(IntsRef ref) { for (int i = ref.offset; i < ref.offset + ref.length; i++) { result.clear(ref.ints[i]); } - cost[0] -= ref.length; + cost[0] = Math.max(0, cost[0] - ref.length); } @Override From 7e20d5b3aad13739a2ae9fa36774e68ccae43ac4 Mon Sep 17 00:00:00 2001 From: Chris Hegarty <62058229+ChrisHegarty@users.noreply.github.com> Date: Mon, 20 Jan 2025 10:33:31 +0100 Subject: [PATCH 36/88] Avoid overflow in index input slices invariant checks (#14126) This commit avoids overflow in index input slices invariant checks. While not a problem in practice, this can lead to more obscure failures which are harder to diagnose. Reworking the invariant checks to avoid the potential to overflow is trivial. Existing tests cover the most cases, while a single new scenario covered the overflow case. --- lucene/CHANGES.txt | 3 +++ .../java/org/apache/lucene/store/BufferedIndexInput.java | 2 +- .../java/org/apache/lucene/store/ByteBuffersDataInput.java | 2 +- .../src/java/org/apache/lucene/store/NIOFSDirectory.java | 2 +- .../org/apache/lucene/store/MemorySegmentIndexInput.java | 2 +- .../src/java/org/apache/lucene/misc/store/RAFDirectory.java | 2 +- .../apache/lucene/tests/store/BaseDirectoryTestCase.java | 6 ++++++ .../lucene/tests/store/SerialIOCountingDirectory.java | 2 +- 8 files changed, 15 insertions(+), 6 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c2b37a94f895..39d7403a11c4 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -84,6 +84,9 @@ Bug Fixes * GITHUB#14123: SortingCodecReader NPE when segment has no (points, vectors, etc...) 
(Mike Sokolov) +* GITHUB#14126: Avoid overflow in index input slices invariant checks + (Chris Hegarty) + Other --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java index 1738259fa2fb..cd47500a2df7 100644 --- a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java @@ -401,7 +401,7 @@ private static final class SlicedIndexInput extends BufferedIndexInput { ? base.toString() : (base.toString() + " [slice=" + sliceDescription + "]"), BufferedIndexInput.BUFFER_SIZE); - if (offset < 0 || length < 0 || offset + length > base.length()) { + if ((length | offset) < 0 || length > base.length() - offset) { throw new IllegalArgumentException( "slice() " + sliceDescription + " out of bounds: " + base); } diff --git a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java index 39e920616209..dee5c8e3a738 100644 --- a/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/ByteBuffersDataInput.java @@ -424,7 +424,7 @@ public void skipBytes(long numBytes) throws IOException { } public ByteBuffersDataInput slice(long offset, long length) { - if (offset < 0 || length < 0 || offset + length > this.length) { + if ((length | offset) < 0 || length > this.length - offset) { throw new IllegalArgumentException( String.format( Locale.ROOT, diff --git a/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java b/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java index c9c92db91b40..b05652789cfa 100644 --- a/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java +++ b/lucene/core/src/java/org/apache/lucene/store/NIOFSDirectory.java @@ -139,7 +139,7 @@ public NIOFSIndexInput clone() { @Override public IndexInput slice(String sliceDescription, long offset, long length) throws IOException { - if (offset < 0 || length < 0 || offset + length > this.length()) { + if ((length | offset) < 0 || length > this.length() - offset) { throw new IllegalArgumentException( "slice() " + sliceDescription diff --git a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java index 74594be5ec99..800a66a2167e 100644 --- a/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java +++ b/lucene/core/src/java21/org/apache/lucene/store/MemorySegmentIndexInput.java @@ -597,7 +597,7 @@ public final MemorySegmentIndexInput clone() { */ @Override public final MemorySegmentIndexInput slice(String sliceDescription, long offset, long length) { - if (offset < 0 || length < 0 || offset + length > this.length) { + if ((length | offset) < 0 || length > this.length - offset) { throw new IllegalArgumentException( "slice() " + sliceDescription diff --git a/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java b/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java index 420d6d40d6de..cd90db9abbe7 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/store/RAFDirectory.java @@ -136,7 +136,7 @@ public RAFIndexInput clone() { @Override public IndexInput slice(String sliceDescription, long offset, long length) throws IOException { - if (offset < 0 || length 
< 0 || offset + length > this.length()) { + if ((length | offset) < 0 || length > this.length() - offset) { throw new IllegalArgumentException( "slice() " + sliceDescription + " out of bounds: " + this); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java index 6defa5eb8c7a..88223a4abebe 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java @@ -771,6 +771,12 @@ public void testSliceOutOfBounds() throws Exception { slice.slice("slice3sub", 1, len / 2); }); + expectThrows( + IllegalArgumentException.class, + () -> { + i.slice("slice4", Long.MAX_VALUE - 1, 10); + }); + i.close(); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SerialIOCountingDirectory.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SerialIOCountingDirectory.java index 1b4234c3d79f..4d3c233257c8 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/SerialIOCountingDirectory.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/SerialIOCountingDirectory.java @@ -194,7 +194,7 @@ public IndexInput slice(String sliceDescription, long offset, long length) throw public IndexInput slice( String sliceDescription, long offset, long length, ReadAdvice readAdvice) throws IOException { - if (offset < 0 || offset + length > sliceLength) { + if ((length | offset) < 0 || length > sliceLength - offset) { throw new IllegalArgumentException(); } IndexInput clone = in.clone(); From 04eecb0483ee3be1fdaa302e27443977914a870d Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 21 Jan 2025 10:38:05 +0100 Subject: [PATCH 37/88] Replace special-casing of `DocBaseBitSetIterator` with `#intoBitSet`. (#14139) This takes advantage of the new `#intoBitSet` API to remove special casing of `DocBaseBitSetIterator` in `FixedBitSet#or(DocIdSetIterator)`. --- .../lucene/util/DocBaseBitSetIterator.java | 15 +++++++++++ .../org/apache/lucene/util/FixedBitSet.java | 27 +++---------------- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java b/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java index 841149f4febe..cf9bf5432b46 100644 --- a/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/util/DocBaseBitSetIterator.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.util; +import java.io.IOException; import org.apache.lucene.search.DocIdSetIterator; /** @@ -89,4 +90,18 @@ public int advance(int target) { public long cost() { return cost; } + + @Override + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { + int actualUpto = Math.min(upTo, length); + // The destination bit set may be shorter than this bit set. This is only legal if all bits + // beyond offset + bitSet.length() are clear. If not, the below call to `super.intoBitSet` will + // throw an exception. 
+ actualUpto = (int) Math.min(actualUpto, offset + (long) bitSet.length()); + if (actualUpto > doc) { + FixedBitSet.orRange(bits, doc - docBase, bitSet, doc - offset, actualUpto - doc); + advance(actualUpto); // set the current doc + } + super.intoBitSet(upTo, bitSet, offset); + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java index 1b6954d2eb66..9867582dd522 100644 --- a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java @@ -339,30 +339,9 @@ public int prevSetBit(int index) { @Override public void or(DocIdSetIterator iter) throws IOException { - if (iter instanceof DocBaseBitSetIterator) { - // TODO: implement DocBaseBitSetIterator#intoBitSet instead - checkUnpositioned(iter); - DocBaseBitSetIterator baseIter = (DocBaseBitSetIterator) iter; - or(baseIter.getDocBase() >> 6, baseIter.getBitSet()); - } else { - checkUnpositioned(iter); - iter.nextDoc(); - iter.intoBitSet(DocIdSetIterator.NO_MORE_DOCS, this, 0); - } - } - - private void or(final int otherOffsetWords, FixedBitSet other) { - or(otherOffsetWords, other.bits, other.numWords); - } - - private void or(final int otherOffsetWords, final long[] otherArr, final int otherNumWords) { - assert otherNumWords + otherOffsetWords <= numWords - : "numWords=" + numWords + ", otherNumWords=" + otherNumWords; - int pos = Math.min(numWords - otherOffsetWords, otherNumWords); - final long[] thisArr = this.bits; - while (--pos >= 0) { - thisArr[pos + otherOffsetWords] |= otherArr[pos]; - } + checkUnpositioned(iter); + iter.nextDoc(); + iter.intoBitSet(DocIdSetIterator.NO_MORE_DOCS, this, 0); } /** Read {@code numBits} (between 1 and 63) bits from {@code bitSet} at {@code from}. */ From f2e7ae40af0b28b1d5f2edc31f8858229a8523f4 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 21 Jan 2025 10:39:32 +0100 Subject: [PATCH 38/88] Implement #intoBitSet on `IntArrayDocIdSet` and `RoaringDocIdSet`. (#14135) These doc id sets can implement `#intoBitSet` in a way that auto-vectorizes. For reference, `RoaringDocIdSet` is used by the query cache, and `IntArrayDocIdSet` is used by point queries. --- .../lucene/search/DocIdSetIterator.java | 2 ++ .../apache/lucene/util/IntArrayDocIdSet.java | 15 ++++++++ .../apache/lucene/util/RoaringDocIdSet.java | 34 +++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java index e0bee1da2314..421323440865 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocIdSetIterator.java @@ -228,6 +228,8 @@ protected final int slowAdvance(int target) throws IOException { * *

    Note: It is important not to clear bits from {@code bitSet} that may be already set. * + *

    Note: {@code offset} may be negative. + * * @lucene.internal */ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/util/IntArrayDocIdSet.java b/lucene/core/src/java/org/apache/lucene/util/IntArrayDocIdSet.java index d44cc7839233..4f764b37dfd9 100644 --- a/lucene/core/src/java/org/apache/lucene/util/IntArrayDocIdSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/IntArrayDocIdSet.java @@ -95,6 +95,21 @@ public int advance(int target) throws IOException { return doc = docs[i++]; } + @Override + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { + if (doc >= upTo) { + return; + } + + int from = i - 1; + int to = VectorUtil.findNextGEQ(docs, upTo, from, length); + for (int i = from; i < to; ++i) { + bitSet.set(docs[i] - offset); + } + doc = docs[to]; + i = to + 1; + } + @Override public long cost() { return length; diff --git a/lucene/core/src/java/org/apache/lucene/util/RoaringDocIdSet.java b/lucene/core/src/java/org/apache/lucene/util/RoaringDocIdSet.java index ccd92a74250e..77038dd07eda 100644 --- a/lucene/core/src/java/org/apache/lucene/util/RoaringDocIdSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/RoaringDocIdSet.java @@ -217,6 +217,20 @@ public int advance(int target) throws IOException { return doc = docId(i); } } + + @Override + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { + if (doc >= upTo) { + return; + } + + int from = i; + advance(upTo); + int to = i; + for (int i = from; i < to; ++i) { + bitSet.set(docId(i) - offset); + } + } }; } } @@ -312,6 +326,26 @@ private int firstDocFromNextBlock() throws IOException { } } + @Override + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { + for (; ; ) { + int subUpto = upTo - (block << 16); + if (subUpto < 0) { + break; + } + int subOffset = offset - (block << 16); + sub.intoBitSet(subUpto, bitSet, subOffset); + if (sub.docID() == NO_MORE_DOCS) { + if (firstDocFromNextBlock() == NO_MORE_DOCS) { + break; + } + } else { + doc = (block << 16) | sub.docID(); + break; + } + } + } + @Override public long cost() { return cardinality; From a0adeb9fa06017621471d8204e47185cc92d9fa3 Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Wed, 22 Jan 2025 08:00:47 +0100 Subject: [PATCH 39/88] Fix TestBpVectorReorderer#testIndexReorderDense (#14153) This commit recomputes the vector list and adds a mapping between the stored id and the document id after the merge. It is then when we compute the DocMap. 
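The sketch below illustrates the remapping this message describes: after forceMerge the doc ids may be renumbered, so the expected ordering has to be computed against a stored-id-to-doc-id table rebuilt from the merged reader. It reuses the reader-side calls that appear in the test diff; the field names "f" and "id" and the single-segment assumption come from that test, while the helper class itself is an illustrative sketch, not the actual test code.

import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;

class StoredIdRemapSketch {
  // Rebuild the stored-id -> docId mapping from a merged index, assuming each document
  // stored its original insertion order in an "id" field and its vector in field "f".
  static int[] storedIdToDocId(Directory dir, int numDocs, List<float[]> vectorsOut)
      throws IOException {
    int[] mapping = new int[numDocs];
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      // assumes the index was force-merged down to a single leaf
      LeafReader leaf = reader.leaves().get(0).reader();
      FloatVectorValues values = leaf.getFloatVectorValues("f");
      StoredFields storedFields = reader.storedFields();
      KnnVectorValues.DocIndexIterator it = values.iterator();
      while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        int storedId = Integer.parseInt(storedFields.document(it.docID()).get("id"));
        vectorsOut.add(values.vectorValue(it.index()).clone());
        mapping[storedId] = it.docID(); // docId after merging, keyed by original stored id
      }
    }
    return mapping;
  }
}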
--- .../misc/index/TestBpVectorReorderer.java | 34 +++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java index e4398a76183a..3b484ed2430a 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java @@ -318,10 +318,7 @@ && angularDifference(t0min, t0max) < angularDifference(t0min, t1max)) public void testIndexReorderDense() throws Exception { List vectors = shuffleVectors(randomLinearVectors()); - // compute the expected ordering - Sorter.DocMap expected = - reorderer.computeValueMap( - FloatVectorValues.fromFloats(vectors, 2), VectorSimilarityFunction.EUCLIDEAN, null); + Path tmpdir = createTempDir(); try (Directory dir = newFSDirectory(tmpdir)) { // create an index with a single leaf @@ -335,6 +332,28 @@ public void testIndexReorderDense() throws Exception { } writer.forceMerge(1); } + + // The docId of the documents might have changed due to merging. Compute a mapping from + // the stored id to the current docId and repopulate the vector list. + int[] storedIdToDocId = new int[vectors.size()]; + vectors.clear(); + try (IndexReader reader = DirectoryReader.open(dir)) { + LeafReader leafReader = getOnlyLeafReader(reader); + FloatVectorValues values = leafReader.getFloatVectorValues("f"); + StoredFields storedFields = reader.storedFields(); + KnnVectorValues.DocIndexIterator it = values.iterator(); + while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + int storedId = Integer.parseInt(storedFields.document(it.docID()).get("id")); + vectors.add(values.vectorValue(it.index()).clone()); + storedIdToDocId[storedId] = it.docID(); + } + } + + // compute the expected ordering + Sorter.DocMap expected = + reorderer.computeValueMap( + FloatVectorValues.fromFloats(vectors, 2), VectorSimilarityFunction.EUCLIDEAN, null); + int threadCount = random().nextInt(4) + 1; threadCount = 1; // reorder using the index reordering tool @@ -355,12 +374,13 @@ public void testIndexReorderDense() throws Exception { StoredFields storedFields = reader.storedFields(); KnnVectorValues.DocIndexIterator it = values.iterator(); while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - int storedId = Integer.parseInt(storedFields.document(it.docID()).get("id")); - assertEquals(expected.oldToNew(storedId), newId); + int oldDocId = + storedIdToDocId[Integer.parseInt(storedFields.document(it.docID()).get("id"))]; + assertEquals(expected.oldToNew(oldDocId), newId); float[] expectedVector = vectors.get(expected.newToOld(it.docID())); float[] actualVector = values.vectorValue(it.index()); assertArrayEquals( - "values differ at index " + storedId + "->" + newId + " docid=" + it.docID(), + "values differ at index " + oldDocId + "->" + newId + " docid=" + it.docID(), expectedVector, actualVector, 0); From c8160b16f6a9936b5666644106831dce6d3d9110 Mon Sep 17 00:00:00 2001 From: Tinker Xiao Date: Wed, 22 Jan 2025 03:48:40 -0800 Subject: [PATCH 40/88] Improve set deletions percentage javadoc (#12828) Co-authored-by: Yuan,Xiao --- .../java/org/apache/lucene/index/TieredMergePolicy.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java index b487012cc7d9..7b56471c8bf4 100644 --- 
a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java @@ -157,9 +157,10 @@ public double getMaxMergedSegmentMB() { } /** - * Controls the maximum percentage of deleted documents that is tolerated in the index. Lower - * values make the index more space efficient at the expense of increased CPU and I/O activity. - * Values must be between 5 and 50. Default value is 20. + * Sets the maximum percentage of doc id space taken by deleted docs. The denominator includes + * both active and deleted documents. Lower values make the index more space efficient at the + * expense of increased CPU and I/O activity. Values must be between 5 and 50. Default value is + * 20. * *

    When the maximum delete percentage is lowered, the indexing thread will call for merges more * often, meaning that write amplification factor will be increased. Write amplification factor From 2240adbf7a48cd5234d02a941b1764504dbbdf07 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Wed, 22 Jan 2025 08:11:49 -0500 Subject: [PATCH 41/88] Revert TestManyKnnDocs changes from #14084 (#14158) --- .../lucene/document/TestManyKnnDocs.java | 136 ++---------------- 1 file changed, 8 insertions(+), 128 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java index 1e485515a62b..2023ee73391d 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java @@ -17,7 +17,6 @@ package org.apache.lucene.document; import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; -import java.nio.file.Path; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -25,27 +24,19 @@ import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.KnnFloatVectorQuery; -import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.MatchNoDocsQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.SeededKnnFloatVectorQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.tests.codecs.vector.ConfigurableMCodec; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.LuceneTestCase.Monster; -import org.junit.BeforeClass; @TimeoutSuite(millis = 86_400_000) // 24 hour timeout @Monster("takes ~10 minutes and needs extra heap, disk space, file handles") public class TestManyKnnDocs extends LuceneTestCase { // gradlew -p lucene/core test --tests TestManyKnnDocs -Ptests.heapsize=16g -Dtests.monster=true - private static Path testDir; - - @BeforeClass - public static void init_index() throws Exception { + public void testLargeSegment() throws Exception { IndexWriterConfig iwc = new IndexWriterConfig(); iwc.setCodec( new ConfigurableMCodec( @@ -55,138 +46,27 @@ public static void init_index() throws Exception { mp.setMaxMergeAtOnce(256); // avoid intermediate merges (waste of time with HNSW?) 
mp.setSegmentsPerTier(256); // only merge once at the end when we ask iwc.setMergePolicy(mp); + String fieldName = "field"; VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.DOT_PRODUCT; - try (Directory dir = FSDirectory.open(testDir = createTempDir("ManyKnnVectorDocs")); + try (Directory dir = FSDirectory.open(createTempDir("ManyKnnVectorDocs")); IndexWriter iw = new IndexWriter(dir, iwc)) { int numVectors = 2088992; + float[] vector = new float[1]; + Document doc = new Document(); + doc.add(new KnnFloatVectorField(fieldName, vector, similarityFunction)); for (int i = 0; i < numVectors; i++) { - float[] vector = new float[1]; - Document doc = new Document(); vector[0] = (i % 256); - doc.add(new KnnFloatVectorField("field", vector, similarityFunction)); - doc.add(new KeywordField("int", "" + i, org.apache.lucene.document.Field.Store.YES)); - doc.add(new StoredField("intValue", i)); iw.addDocument(doc); } // merge to single segment and then verify iw.forceMerge(1); iw.commit(); - } - } - - public void testLargeSegmentKnn() throws Exception { - try (Directory dir = FSDirectory.open(testDir)) { IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir)); - for (int i = 0; i < 256; i++) { - Query filterQuery = new MatchAllDocsQuery(); - float[] vector = new float[128]; - vector[0] = i; - vector[1] = 1; - TopDocs docs = - searcher.search(new KnnFloatVectorQuery("field", vector, 10, filterQuery), 5); - assertEquals(5, docs.scoreDocs.length); - Document d = searcher.storedFields().document(docs.scoreDocs[0].doc); - String s = ""; - for (int j = 0; j < docs.scoreDocs.length - 1; j++) { - s += docs.scoreDocs[j].doc + " " + docs.scoreDocs[j].score + "\n"; - } - assertEquals(s, i + 256, d.getField("intValue").numericValue()); - } - } - } - - public void testLargeSegmentSeededExact() throws Exception { - try (Directory dir = FSDirectory.open(testDir)) { - IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir)); - for (int i = 0; i < 256; i++) { - Query seedQuery = KeywordField.newExactQuery("int", "" + (i + 256)); - Query filterQuery = new MatchAllDocsQuery(); - float[] vector = new float[128]; - vector[0] = i; - vector[1] = 1; - TopDocs docs = - searcher.search( - new SeededKnnFloatVectorQuery("field", vector, 10, filterQuery, seedQuery), 5); - assertEquals(5, docs.scoreDocs.length); - String s = ""; - for (int j = 0; j < docs.scoreDocs.length - 1; j++) { - s += docs.scoreDocs[j].doc + " " + docs.scoreDocs[j].score + "\n"; - } - Document d = searcher.storedFields().document(docs.scoreDocs[0].doc); - assertEquals(s, i + 256, d.getField("intValue").numericValue()); - } - } - } - - public void testLargeSegmentSeededNearby() throws Exception { - try (Directory dir = FSDirectory.open(testDir)) { - IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir)); - for (int i = 0; i < 256; i++) { - Query seedQuery = KeywordField.newExactQuery("int", "" + i); - Query filterQuery = new MatchAllDocsQuery(); - float[] vector = new float[128]; - vector[0] = i; - vector[1] = 1; - TopDocs docs = - searcher.search( - new SeededKnnFloatVectorQuery("field", vector, 10, filterQuery, seedQuery), 5); - assertEquals(5, docs.scoreDocs.length); - String s = ""; - for (int j = 0; j < docs.scoreDocs.length - 1; j++) { - s += docs.scoreDocs[j].doc + " " + docs.scoreDocs[j].score + "\n"; - } - Document d = searcher.storedFields().document(docs.scoreDocs[0].doc); - assertEquals(s, i + 256, d.getField("intValue").numericValue()); - } - } - } - - public void 
testLargeSegmentSeededDistant() throws Exception { - try (Directory dir = FSDirectory.open(testDir)) { - IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir)); - for (int i = 0; i < 256; i++) { - Query seedQuery = KeywordField.newExactQuery("int", "" + (i + 128)); - Query filterQuery = new MatchAllDocsQuery(); - float[] vector = new float[128]; - vector[0] = i; - vector[1] = 1; - TopDocs docs = - searcher.search( - new SeededKnnFloatVectorQuery("field", vector, 10, filterQuery, seedQuery), 5); - assertEquals(5, docs.scoreDocs.length); - Document d = searcher.storedFields().document(docs.scoreDocs[0].doc); - String s = ""; - for (int j = 0; j < docs.scoreDocs.length - 1; j++) { - s += docs.scoreDocs[j].doc + " " + docs.scoreDocs[j].score + "\n"; - } - assertEquals(s, i + 256, d.getField("intValue").numericValue()); - } - } - } - - public void testLargeSegmentSeededNone() throws Exception { - try (Directory dir = FSDirectory.open(testDir)) { - IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir)); - for (int i = 0; i < 256; i++) { - Query seedQuery = new MatchNoDocsQuery(); - Query filterQuery = new MatchAllDocsQuery(); - float[] vector = new float[128]; - vector[0] = i; - vector[1] = 1; - TopDocs docs = - searcher.search( - new SeededKnnFloatVectorQuery("field", vector, 10, filterQuery, seedQuery), 5); - assertEquals(5, docs.scoreDocs.length); - Document d = searcher.storedFields().document(docs.scoreDocs[0].doc); - String s = ""; - for (int j = 0; j < docs.scoreDocs.length - 1; j++) { - s += docs.scoreDocs[j].doc + " " + docs.scoreDocs[j].score + "\n"; - } - assertEquals(s, i + 256, d.getField("intValue").numericValue()); - } + TopDocs docs = searcher.search(new KnnFloatVectorQuery("field", new float[] {120}, 10), 5); + assertEquals(5, docs.scoreDocs.length); } } } From 78aff8bb59943b8245370d721df9f54bb1837547 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Wed, 22 Jan 2025 08:31:12 -0500 Subject: [PATCH 42/88] move MultiLeafKnnCollector to decorator and remove unnecessary code (#14147) --- .../search/knn/MultiLeafKnnCollector.java | 34 ++----------------- 1 file changed, 2 insertions(+), 32 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java b/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java index 051cd9ed6339..6b5e398d7087 100644 --- a/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java @@ -19,7 +19,6 @@ import org.apache.lucene.search.AbstractKnnCollector; import org.apache.lucene.search.KnnCollector; -import org.apache.lucene.search.TopDocs; import org.apache.lucene.util.hnsw.BlockingFloatHeap; import org.apache.lucene.util.hnsw.FloatHeap; @@ -29,7 +28,7 @@ * * @lucene.experimental */ -public final class MultiLeafKnnCollector implements KnnCollector { +public final class MultiLeafKnnCollector extends KnnCollector.Decorator { // greediness of globally non-competitive search: (0,1] private static final float DEFAULT_GREEDINESS = 0.9f; @@ -77,6 +76,7 @@ public MultiLeafKnnCollector( int interval, BlockingFloatHeap globalSimilarityQueue, AbstractKnnCollector subCollector) { + super(subCollector); if (greediness < 0 || greediness > 1) { throw new IllegalArgumentException("greediness must be in [0,1]"); } @@ -91,31 +91,6 @@ public MultiLeafKnnCollector( this.updatesScratch = new float[k]; } - @Override - public boolean earlyTerminated() { - return 
subCollector.earlyTerminated(); - } - - @Override - public void incVisitedCount(int count) { - subCollector.incVisitedCount(count); - } - - @Override - public long visitedCount() { - return subCollector.visitedCount(); - } - - @Override - public long visitLimit() { - return subCollector.visitLimit(); - } - - @Override - public int k() { - return subCollector.k(); - } - @Override public boolean collect(int docId, float similarity) { boolean localSimUpdated = subCollector.collect(docId, similarity); @@ -157,11 +132,6 @@ public float minCompetitiveSimilarity() { Math.min(nonCompetitiveQueue.peek(), cachedGlobalMinSim)); } - @Override - public TopDocs topDocs() { - return subCollector.topDocs(); - } - @Override public String toString() { return "MultiLeafKnnCollector[subCollector=" + subCollector + "]"; From fb48d0da4991906c0a6598388c1ac29f3ace15a9 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Wed, 22 Jan 2025 08:40:47 -0500 Subject: [PATCH 43/88] gh-14127: remove duplicate neighbors when writing HNSW graphs (#14157) --- .../lucene99/Lucene99HnswVectorsWriter.java | 21 +++++++++++++------ .../org/apache/lucene/util/hnsw/HnswUtil.java | 1 + 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java index a587449e2e7b..4983fdec6bff 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java @@ -408,6 +408,7 @@ private int[][] writeGraph(OnHeapHnswGraph graph) throws IOException { // write vectors' neighbours on each level into the vectorIndex file int countOnLevel0 = graph.size(); int[][] offsets = new int[graph.numLevels()][]; + int[] scratch = new int[graph.maxConn() * 2]; for (int level = 0; level < graph.numLevels(); level++) { int[] sortedNodes = NodesIterator.getSortedNodes(graph.getNodesOnLevel(level)); offsets[level] = new int[sortedNodes.length]; @@ -417,18 +418,26 @@ private int[][] writeGraph(OnHeapHnswGraph graph) throws IOException { int size = neighbors.size(); // Write size in VInt as the neighbors list is typically small long offsetStart = vectorIndex.getFilePointer(); - vectorIndex.writeVInt(size); - // Destructively modify; it's ok we are discarding it after this int[] nnodes = neighbors.nodes(); Arrays.sort(nnodes, 0, size); // Now that we have sorted, do delta encoding to minimize the required bits to store the // information - for (int i = size - 1; i > 0; --i) { + int actualSize = 0; + if (size > 0) { + scratch[0] = nnodes[0]; + actualSize = 1; + } + for (int i = 1; i < size; i++) { assert nnodes[i] < countOnLevel0 : "node too large: " + nnodes[i] + ">=" + countOnLevel0; - nnodes[i] -= nnodes[i - 1]; + if (nnodes[i - 1] == nnodes[i]) { + continue; + } + scratch[actualSize++] = nnodes[i] - nnodes[i - 1]; } - for (int i = 0; i < size; i++) { - vectorIndex.writeVInt(nnodes[i]); + // Write the size after duplicates are removed + vectorIndex.writeVInt(actualSize); + for (int i = 0; i < actualSize; i++) { + vectorIndex.writeVInt(scratch[i]); } offsets[level][nodeOffsetId++] = Math.toIntExact(vectorIndex.getFilePointer() - offsetStart); diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswUtil.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswUtil.java index d0d398be2a78..9540a972fb20 100644 --- 
a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswUtil.java @@ -129,6 +129,7 @@ static List components( } Component component = markRooted(hnsw, level, connectedNodes, notFullyConnected, maxConn, nextClear); + assert component.start() == nextClear; assert component.size() > 0; components.add(component); total += component.size(); From 84877180df3553ef79f938dd1a51561d761b7719 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 23 Jan 2025 14:40:03 +0100 Subject: [PATCH 44/88] Add small bias towards bit set encoding. (#14155) Currently, blocks of postings get encoded as a bit set instead of packed deltas (FOR) whenever the bit set is more storage-efficient. However, the bit set approach is quite more CPU-efficient at search time, so this PR introduces a small bias towards the bit set encoding by using it as soon as it's more storage-efficient than FOR with the next number of bits per value. The impact on storage efficiency of the Wikipedia dataset is negligible (+0.15% on `.doc` files, while `.doc` files don't dominate storage requirements, positions do) while some queries get a good speedup. --- .../lucene101/Lucene101PostingsWriter.java | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java index 1cabefe681ef..3d19a69b82d8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java @@ -424,15 +424,17 @@ private void flushDocBlock(boolean finishTerm) throws IOException { long numSkipBytes = level0Output.size(); // Now we need to decide whether to encode block deltas as packed integers (FOR) or unary // codes (bit set). FOR makes #nextDoc() a bit faster while the bit set approach makes - // #advance() sometimes faster and #intoBitSet() much faster. Since the trade-off is not - // obvious, we make the decision purely based on storage efficiency, using the approach that - // requires fewer bits to encode the block. + // #advance() usually faster and #intoBitSet() much faster. In the end, we make the decision + // based on storage requirements, picking the bit set approach whenever it's more + // storage-efficient than the next number of bits per value (which effectively slightly biases + // towards the bit set approach). int bitsPerValue = forDeltaUtil.bitsRequired(docDeltaBuffer); int sum = Math.toIntExact(Arrays.stream(docDeltaBuffer).sum()); int numBitSetLongs = FixedBitSet.bits2words(sum); + int numBitsNextBitsPerValue = Math.min(Integer.SIZE, bitsPerValue + 1) * BLOCK_SIZE; if (sum == BLOCK_SIZE) { level0Output.writeByte((byte) 0); - } else if (version < VERSION_DENSE_BLOCKS_AS_BITSETS || bitsPerValue * BLOCK_SIZE < sum) { + } else if (version < VERSION_DENSE_BLOCKS_AS_BITSETS || numBitsNextBitsPerValue <= sum) { level0Output.writeByte((byte) bitsPerValue); forDeltaUtil.encodeDeltas(bitsPerValue, docDeltaBuffer, level0Output); } else { @@ -444,10 +446,9 @@ private void flushDocBlock(boolean finishTerm) throws IOException { s += i; spareBitSet.set(s); } - // Since we use the bit set encoding when it's more storage efficient than storing deltas, - // we know that each doc ID uses less than 32 bits, the maximum number of bits required to - // store a delta between consecutive doc IDs. 
So in the end, the bit set cannot have more - // than BLOCK_SIZE * Integer.SIZE / Long.SIZE = 64 longs, which fits on a byte. + // We never use the bit set encoding when it requires more than Integer.SIZE=32 bits per + // value. So the bit set cannot have more than BLOCK_SIZE * Integer.SIZE / Long.SIZE = 64 + // longs, which fits on a byte. assert numBitSetLongs <= BLOCK_SIZE / 2; level0Output.writeByte((byte) -numBitSetLongs); for (int i = 0; i < numBitSetLongs; ++i) { From 52d38094882784efb4cd7e01e96ac253c2ef6610 Mon Sep 17 00:00:00 2001 From: gf2121 <52390227+gf2121@users.noreply.github.com> Date: Sat, 25 Jan 2025 11:39:23 +0800 Subject: [PATCH 45/88] Not maintain docBufferUpTo when only docs needed (#14164) --- .../lucene101/Lucene101PostingsReader.java | 54 +++++++++++-------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java index b4ccff69fed9..04b39c23974b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java @@ -615,19 +615,21 @@ private void refillFullBlock() throws IOException { numLongs = -bitsPerValue; docIn.readLongs(docBitSet.getBits(), 0, numLongs); } - // Note: we know that BLOCK_SIZE bits are set, so no need to compute the cumulative pop - // count at the last index, it will be BLOCK_SIZE. - // Note: this for loop auto-vectorizes - for (int i = 0; i < numLongs - 1; ++i) { - docCumulativeWordPopCounts[i] = Long.bitCount(docBitSet.getBits()[i]); - } - for (int i = 1; i < numLongs - 1; ++i) { - docCumulativeWordPopCounts[i] += docCumulativeWordPopCounts[i - 1]; + if (needsFreq) { + // Note: we know that BLOCK_SIZE bits are set, so no need to compute the cumulative pop + // count at the last index, it will be BLOCK_SIZE. + // Note: this for loop auto-vectorizes + for (int i = 0; i < numLongs - 1; ++i) { + docCumulativeWordPopCounts[i] = Long.bitCount(docBitSet.getBits()[i]); + } + for (int i = 1; i < numLongs - 1; ++i) { + docCumulativeWordPopCounts[i] += docCumulativeWordPopCounts[i - 1]; + } + docCumulativeWordPopCounts[numLongs - 1] = BLOCK_SIZE; + assert docCumulativeWordPopCounts[numLongs - 2] + + Long.bitCount(docBitSet.getBits()[numLongs - 1]) + == BLOCK_SIZE; } - docCumulativeWordPopCounts[numLongs - 1] = BLOCK_SIZE; - assert docCumulativeWordPopCounts[numLongs - 2] - + Long.bitCount(docBitSet.getBits()[numLongs - 1]) - == BLOCK_SIZE; encoding = DeltaEncoding.UNARY; } if (indexHasFreq) { @@ -726,7 +728,7 @@ private void skipLevel1To(int target) throws IOException { } private void doMoveToNextLevel0Block() throws IOException { - assert docBufferUpto == BLOCK_SIZE; + assert doc == level0LastDocID; if (posIn != null) { if (level0PosEndFP >= posIn.getFilePointer()) { posIn.seek(level0PosEndFP); @@ -912,7 +914,7 @@ private void doAdvanceShallow(int target) throws IOException { @Override public int nextDoc() throws IOException { - if (docBufferUpto == BLOCK_SIZE) { + if (doc == level0LastDocID) { moveToNextLevel0Block(); } @@ -954,13 +956,21 @@ public int advance(int target) throws IOException { int next = docBitSet.nextSetBit(target - docBitSetBase); assert next != NO_MORE_DOCS; this.doc = docBitSetBase + next; - int wordIndex = next >> 6; - // Take the cumulative pop count for the given word, and subtract bits on the left of - // the current doc. 
- docBufferUpto = - 1 - + docCumulativeWordPopCounts[wordIndex] - - Long.bitCount(docBitSet.getBits()[wordIndex] >>> next); + if (needsFreq) { + int wordIndex = next >> 6; + // Take the cumulative pop count for the given word, and subtract bits on the left of + // the current doc. + docBufferUpto = + 1 + + docCumulativeWordPopCounts[wordIndex] + - Long.bitCount(docBitSet.getBits()[wordIndex] >>> next); + } else { + // When only docs needed and block is UNARY encoded, we do not need to maintain + // docBufferUpTo to record the iteration position in the block. + // docBufferUpTo == 0 means the block has not been iterated. + // docBufferUpTo != 0 means the block has been iterated. + docBufferUpto = 1; + } } break; } @@ -978,7 +988,7 @@ public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOExcept bitSet.set(doc - offset); for (; ; ) { - if (docBufferUpto == BLOCK_SIZE) { + if (doc == level0LastDocID) { // refill moveToNextLevel0Block(); } From 1cd77c2ca2a457e46b269d27084f618901020fe8 Mon Sep 17 00:00:00 2001 From: Chris Hegarty <62058229+ChrisHegarty@users.noreply.github.com> Date: Sat, 25 Jan 2025 15:47:52 +0000 Subject: [PATCH 46/88] Avoid double buffering direct IO index input slices with BufferedIndexInput (#14103) This commit avoids double buffering direct IO index input slices with BufferedIndexInput. Currently BufferedIndexInput is used for slicing, since it will handle the initial offset and length, but this adds an extra layer of buffering - the buffer in buffered index input as well as the buffer in direct IO index input. This change reflows direct IO index input so that it can handle an offset and length, so can be its own implementation for slices. Existing tests covered this, but I found case where a clone of a slice was not covered. I added a small change to the base directory test case which covers this. 
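To make the slicing approach concrete, a minimal self-contained sketch follows (illustrative only: SlicedInput and its members are hypothetical stand-ins, not the DirectIOIndexInput code in this patch). The slice records its own absolute offset and visible length, translates slice-relative positions to absolute positions on every read, and re-checks bounds when slicing again, so no extra buffering wrapper is needed:

import java.io.EOFException;
import java.io.IOException;
import java.util.Objects;

// Sketch of an input that owns an offset/length pair instead of being wrapped in an
// extra buffering layer; a byte[] stands in for the underlying file channel.
final class SlicedInput {
  private final byte[] data;  // stand-in for the real storage
  private final long offset;  // absolute start of this slice
  private final long length;  // number of bytes visible through this slice
  private long pos;           // slice-relative position in [0, length]

  SlicedInput(byte[] data, long offset, long length) {
    Objects.checkFromIndexSize(offset, length, data.length);
    this.data = data;
    this.offset = offset;
    this.length = length;
  }

  long length() {
    return length; // report the slice length, not the file size
  }

  void seek(long p) throws IOException {
    if (p < 0 || p > length) {
      throw new EOFException("seek past EOF: " + p);
    }
    pos = p;
  }

  byte readByte() throws IOException {
    if (pos >= length) {
      throw new EOFException("read past EOF: " + pos);
    }
    // translate the slice-relative position into an absolute position
    return data[(int) (offset + pos++)];
  }

  // A slice of a slice composes offsets and re-checks bounds.
  SlicedInput slice(long off, long len) {
    if ((off | len) < 0 || len > this.length - off) {
      throw new IllegalArgumentException("slice out of bounds: off=" + off + ", len=" + len);
    }
    return new SlicedInput(data, this.offset + off, len);
  }
}

Under this scheme a clone is simply a full-range slice that additionally seeks back to the current position, which is the case the added BaseDirectoryTestCase change exercises.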
--- .../lucene/misc/store/DirectIODirectory.java | 92 +++++++++++-------- .../tests/store/BaseDirectoryTestCase.java | 3 + 2 files changed, 55 insertions(+), 40 deletions(-) diff --git a/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java b/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java index 015d0ee13136..b56a206e60b1 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/store/DirectIODirectory.java @@ -28,6 +28,7 @@ import java.nio.file.Path; import java.nio.file.StandardOpenOption; import java.util.Arrays; +import java.util.Objects; import java.util.OptionalLong; import java.util.zip.CRC32; import java.util.zip.Checksum; @@ -298,9 +299,10 @@ private static final class DirectIOIndexInput extends IndexInput { private final ByteBuffer buffer; private final FileChannel channel; private final int blockSize; - + private final long offset; + private final long length; + private final boolean isClosable; // clones and slices are not closable private boolean isOpen; - private boolean isClone; private long filePos; /** @@ -313,31 +315,32 @@ private static final class DirectIOIndexInput extends IndexInput { */ public DirectIOIndexInput(Path path, int blockSize, int bufferSize) throws IOException { super("DirectIOIndexInput(path=\"" + path + "\")"); - this.blockSize = blockSize; - this.channel = FileChannel.open(path, StandardOpenOption.READ, getDirectOpenOption()); + this.blockSize = blockSize; this.buffer = allocateBuffer(bufferSize, blockSize); - - isOpen = true; - isClone = false; - filePos = -bufferSize; - buffer.limit(0); + this.isOpen = true; + this.isClosable = true; + this.length = channel.size(); + this.offset = 0L; + this.filePos = -bufferSize; + this.buffer.limit(0); } - // for clone - private DirectIOIndexInput(DirectIOIndexInput other) throws IOException { - super(other.toString()); - this.channel = other.channel; - this.blockSize = other.blockSize; - + // for clone/slice + private DirectIOIndexInput( + String description, DirectIOIndexInput other, long offset, long length) throws IOException { + super(description); + Objects.checkFromIndexSize(offset, length, other.channel.size()); final int bufferSize = other.buffer.capacity(); - this.buffer = allocateBuffer(bufferSize, blockSize); - - isOpen = true; - isClone = true; - filePos = -bufferSize; + this.buffer = allocateBuffer(bufferSize, other.blockSize); + this.blockSize = other.blockSize; + this.channel = other.channel; + this.isOpen = true; + this.isClosable = false; + this.length = length; + this.offset = offset; + this.filePos = -bufferSize; buffer.limit(0); - seek(other.getFilePointer()); } private static ByteBuffer allocateBuffer(int bufferSize, int blockSize) { @@ -348,20 +351,21 @@ private static ByteBuffer allocateBuffer(int bufferSize, int blockSize) { @Override public void close() throws IOException { - if (isOpen && !isClone) { + if (isOpen && isClosable) { channel.close(); + isOpen = false; } } @Override public long getFilePointer() { - long filePointer = filePos + buffer.position(); + long filePointer = filePos + buffer.position() - offset; // opening the input and immediately calling getFilePointer without calling readX (and thus // refill) first, // will result in negative value equal to bufferSize being returned, // due to the initialization method filePos = -bufferSize used in constructor. 
- assert filePointer == -buffer.capacity() || filePointer >= 0 + assert filePointer == -buffer.capacity() - offset || filePointer >= 0 : "filePointer should either be initial value equal to negative buffer capacity, or larger than or equal to 0"; return Math.max(filePointer, 0); } @@ -369,23 +373,24 @@ public long getFilePointer() { @Override public void seek(long pos) throws IOException { if (pos != getFilePointer()) { - final long alignedPos = pos - (pos % blockSize); - filePos = alignedPos - buffer.capacity(); - - final int delta = (int) (pos - alignedPos); - refill(delta); - buffer.position(delta); + seekInternal(pos); } assert pos == getFilePointer(); } + private void seekInternal(long pos) throws IOException { + final long absPos = pos + offset; + final long alignedPos = absPos - (absPos % blockSize); + filePos = alignedPos - buffer.capacity(); + + final int delta = (int) (absPos - alignedPos); + refill(delta); + buffer.position(delta); + } + @Override public long length() { - try { - return channel.size(); - } catch (IOException ioe) { - throw new UncheckedIOException(ioe); - } + return length; } @Override @@ -429,7 +434,7 @@ private void refill(int bytesToRead) throws IOException { // BaseDirectoryTestCase#testSeekPastEOF test for consecutive read past EOF, // hence throwing EOFException early to maintain buffer state (position in particular) - if (filePos > channel.size() || (channel.size() - filePos < bytesToRead)) { + if (filePos > offset + length || ((offset + length) - filePos < bytesToRead)) { throw new EOFException("read past EOF: " + this); } @@ -523,16 +528,23 @@ public void readLongs(long[] dst, int offset, int len) throws IOException { @Override public DirectIOIndexInput clone() { try { - return new DirectIOIndexInput(this); + var clone = new DirectIOIndexInput("clone:" + this, this, offset, length); + clone.seekInternal(getFilePointer()); + return clone; } catch (IOException ioe) { throw new UncheckedIOException(ioe); } } @Override - public IndexInput slice(String sliceDescription, long offset, long length) { - // TODO: is this the right thing to do? 
- return BufferedIndexInput.wrap(sliceDescription, this, offset, length); + public IndexInput slice(String sliceDescription, long offset, long length) throws IOException { + if ((length | offset) < 0 || length > this.length - offset) { + throw new IllegalArgumentException( + "slice() " + sliceDescription + " out of bounds: " + this); + } + var slice = new DirectIOIndexInput(sliceDescription, this, this.offset + offset, length); + slice.seekInternal(0L); + return slice; } } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java index 88223a4abebe..f00b3811d0c4 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/store/BaseDirectoryTestCase.java @@ -1206,6 +1206,9 @@ public void testSliceOfSlice() throws Exception { slice1.seek(TestUtil.nextLong(random(), 0, slice1.length())); for (int j = 0; j < slice1.length(); j += 16) { IndexInput slice2 = slice1.slice("slice2", j, num - i - j); + if (random().nextBoolean()) { + slice2 = slice2.clone(); // clone shouldn't impact slice data + } assertEquals(0, slice2.getFilePointer()); assertEquals(num - i - j, slice2.length()); byte[] data = new byte[num]; From 93b996fe7d7183bce0a9f919eb43721a1ee8bbd2 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sat, 25 Jan 2025 15:59:37 +0000 Subject: [PATCH 47/88] Add changes for Optimize DirectIOIndexInput #14106 --- lucene/CHANGES.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 39d7403a11c4..28d8a7e72e90 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -65,6 +65,10 @@ Improvements BulkAdder#add(IntsRef) method. They should provide better performance due to less virtual method calls and more efficient bulk processing. (Ignacio Vera) +* GITHUB#14107, GITHUB#14124, GITHUB#14103: Optimize DirectIOIndexInput; add + individual and bulk data retrieval overloads; avoid double buffering with + slices. 
(Chris Hegarty) + Optimizations --------------------- From 63374b1633f518a504b580343434e0bce8e3f5a7 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Mon, 27 Jan 2025 10:49:08 +0000 Subject: [PATCH 48/88] fix maxDocs checks in CuVSIndex --- .../org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index 6d2a4e281911..adb820bbb4c1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -48,7 +48,10 @@ public CuVSIndex( this.vectors = Objects.requireNonNull(vectors); this.fieldName = Objects.requireNonNull(fieldName); this.segmentName = Objects.requireNonNull(segmentName); - this.maxDocs = Objects.requireNonNull(maxDocs); + if (maxDocs < 0) { + throw new IllegalArgumentException("negative maxDocs:" +maxDocs); + } + this.maxDocs = maxDocs; } public CagraIndex getCagraIndex() { From f75c50e3316a1802e9c46f04c37126ca77a8c4ff Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Mon, 27 Jan 2025 10:50:22 +0000 Subject: [PATCH 49/88] tidy - just to remove noise --- .../vectorsearch/CagraFieldVectorsWriter.java | 4 +--- .../lucene/sandbox/vectorsearch/CuVSCodec.java | 4 +--- .../lucene/sandbox/vectorsearch/CuVSIndex.java | 6 ++---- .../vectorsearch/CuVSKnnFloatVectorQuery.java | 4 +--- .../lucene/sandbox/vectorsearch/CuVSSegmentFile.java | 4 +--- .../sandbox/vectorsearch/CuVSVectorsFormat.java | 4 +--- .../sandbox/vectorsearch/CuVSVectorsReader.java | 4 +--- .../sandbox/vectorsearch/CuVSVectorsWriter.java | 12 +++--------- .../vectorsearch/PerLeafCuVSKnnCollector.java | 4 +--- .../sandbox/vectorsearch/SegmentInputStream.java | 4 +--- .../org/apache/lucene/sandbox/vectorsearch/Util.java | 4 +--- .../lucene/sandbox/vectorsearch/package-info.java | 4 +--- 12 files changed, 15 insertions(+), 43 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java index 6940b9bfeea6..de2c7315f033 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -22,9 +22,7 @@ import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.index.FieldInfo; -/** - * CuVS based fields writer - */ +/** CuVS based fields writer */ public class CagraFieldVectorsWriter extends KnnFieldVectorsWriter { public final String fieldName; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java index 1e3c85d746ef..32ca1077887c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -24,9 +24,7 @@ import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; -/** - * CuVS based codec for GPU based vector search - */ +/** CuVS based codec for GPU based vector search */ public class CuVSCodec extends FilterCodec { public CuVSCodec() { diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index adb820bbb4c1..9258a04fc5c2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -21,9 +21,7 @@ import java.util.List; import java.util.Objects; -/** - * This class holds references to the actual CuVS Index (Cagra, Brute force, etc.) - */ +/** This class holds references to the actual CuVS Index (Cagra, Brute force, etc.) */ public class CuVSIndex { private final CagraIndex cagraIndex; private final BruteForceIndex bruteforceIndex; @@ -49,7 +47,7 @@ public CuVSIndex( this.fieldName = Objects.requireNonNull(fieldName); this.segmentName = Objects.requireNonNull(segmentName); if (maxDocs < 0) { - throw new IllegalArgumentException("negative maxDocs:" +maxDocs); + throw new IllegalArgumentException("negative maxDocs:" + maxDocs); } this.maxDocs = maxDocs; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java index e4ce49fb84f7..2f6c636590ef 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -24,9 +24,7 @@ import org.apache.lucene.search.knn.KnnCollectorManager; import org.apache.lucene.util.Bits; -/** - * Query for CuVS - */ +/** Query for CuVS */ public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { private final int iTopK; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java index e6be4726f16e..ddbf8fc9d29e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java @@ -26,9 +26,7 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; -/** - * Methods to deal with a CuVS composite file inside a segment - */ +/** Methods to deal with a CuVS composite file inside a segment */ public class CuVSSegmentFile implements AutoCloseable { private final ZipOutputStream zos; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index e3928a31b050..d2f6c78417f5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -24,9 +24,7 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; -/** - * CuVS based KnnVectorsFormat for GPU acceleration - */ +/** CuVS based KnnVectorsFormat for GPU acceleration */ public class CuVSVectorsFormat extends KnnVectorsFormat { public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index b41e5c08f177..d65dfbd288cc 100644 --- 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -51,9 +51,7 @@ import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; -/** - * KnnVectorsReader instance associated with CuVS format - */ +/** KnnVectorsReader instance associated with CuVS format */ public class CuVSVectorsReader extends KnnVectorsReader { // protected Logger log = Logger.getLogger(getClass().getName()); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index bb40b7119a0e..2f595def8446 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -42,9 +42,7 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.SuppressForbidden; -/** - * KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU - */ +/** KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU */ public class CuVSVectorsWriter extends KnnVectorsWriter { // protected Logger log = Logger.getLogger(getClass().getName()); @@ -63,9 +61,7 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { private MergeStrategy mergeStrategy; private CuVSResources resources; - /** - * Merge strategy used for CuVS - */ + /** Merge strategy used for CuVS */ public enum MergeStrategy { TRIVIAL_MERGE, NON_TRIVIAL_MERGE @@ -371,9 +367,7 @@ public void finish() throws IOException { } } - /** - * OutputStream for writing into an IndexOutput - */ + /** OutputStream for writing into an IndexOutput */ public class SegmentOutputStream extends OutputStream { IndexOutput out; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java index a1473c4acf20..ffba5f0c0f1f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -23,9 +23,7 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; -/** - * KnnCollector for CuVS - */ +/** KnnCollector for CuVS */ public class PerLeafCuVSKnnCollector implements KnnCollector { public List scoreDocs; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java index 47c6d3c3cedf..73fba879f6ad 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java @@ -20,9 +20,7 @@ import java.io.InputStream; import org.apache.lucene.store.IndexInput; -/** - * InputStream semantics for reading from an IndexInput - */ +/** InputStream semantics for reading from an IndexInput */ public class SegmentInputStream extends InputStream { /** */ diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java index dfe60b29ea27..35eaf35bc920 100644 --- 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java @@ -25,9 +25,7 @@ import java.util.zip.ZipInputStream; import org.apache.commons.lang3.SerializationUtils; -/** - * Some Utils used in CuVS integration - */ +/** Some Utils used in CuVS integration */ public class Util { public static ByteArrayOutputStream getZipEntryBAOS( diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java index a11c94e7224b..86c56b909dd1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java @@ -15,7 +15,5 @@ * limitations under the License. */ -/** - * CuVS based fast vector search - */ +/** CuVS based fast vector search */ package org.apache.lucene.sandbox.vectorsearch; From f315f53acab338e92a14b073efcadcb5ce8d5435 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Mon, 27 Jan 2025 07:09:41 -0500 Subject: [PATCH 50/88] Optimize ContextQuery with big number of contexts (#14169) When there is a big number of contexts, ContextQuery may take a lot of time because of how context automata are constructed. Instead of the current approach of repeatedly concatenating and unioning context automata, this PR first constructs all individual context automata and then does one single union at the end. Thus for the added test with 1000 contexts, the performance improved from 4000 ms to 18 ms. --- lucene/CHANGES.txt | 2 ++ .../search/suggest/document/ContextQuery.java | 11 +++---- .../suggest/document/TestContextQuery.java | 32 +++++++++++++++++++ 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 28d8a7e72e90..b36a48c2fd00 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -80,6 +80,8 @@ Optimizations * GITHUB#14133: Dense blocks of postings are now encoded as bit sets. (Adrien Grand) +* GITHUB#14169: Optimize ContextQuery with big number of contexts.
(Mayya Sharipova) + Bug Fixes --------------------- diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java index bbcfc7feb439..496d3b9232dc 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/ContextQuery.java @@ -17,8 +17,10 @@ package org.apache.lucene.search.suggest.document; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; +import java.util.List; import java.util.Map; import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; import org.apache.lucene.internal.hppc.IntHashSet; @@ -230,7 +232,7 @@ private static Automaton toContextAutomaton( if (matchAllContexts || contexts.size() == 0) { return Operations.concatenate(matchAllAutomaton, sep); } else { - Automaton contextsAutomaton = null; + List automataList = new ArrayList<>(); for (Map.Entry entry : contexts.entrySet()) { final ContextMetaData contextMetaData = entry.getValue(); final IntsRef ref = entry.getKey(); @@ -239,12 +241,9 @@ private static Automaton toContextAutomaton( contextAutomaton = Operations.concatenate(contextAutomaton, matchAllAutomaton); } contextAutomaton = Operations.concatenate(contextAutomaton, sep); - if (contextsAutomaton == null) { - contextsAutomaton = contextAutomaton; - } else { - contextsAutomaton = Operations.union(contextsAutomaton, contextAutomaton); - } + automataList.add(contextAutomaton); } + Automaton contextsAutomaton = Operations.union(automataList); return contextsAutomaton; } } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextQuery.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextQuery.java index 01eb834a6b0b..cf7d5f8e745b 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextQuery.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestContextQuery.java @@ -468,6 +468,38 @@ public void testMultiContextQuery() throws Exception { iw.close(); } + @Test + public void testBigNumberOfContextsQuery() throws Exception { + Analyzer analyzer = new MockAnalyzer(random()); + RandomIndexWriter iw = + new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field")); + for (int i = 1; i < 1001; i++) { + Document document = new Document(); + document.add( + new ContextSuggestField("suggest_field", "suggestion" + i, 1001 - i, "group" + i)); + iw.addDocument(document); + } + iw.commit(); + + DirectoryReader reader = iw.getReader(); + SuggestIndexSearcher suggestIndexSearcher = new SuggestIndexSearcher(reader); + ContextQuery query = + new ContextQuery(new PrefixCompletionQuery(analyzer, new Term("suggest_field", "sugg"))); + for (int i = 1; i < 1001; i++) { + query.addContext("group" + i, 1); + } + TopSuggestDocs suggest = suggestIndexSearcher.suggest(query, 5, false); + assertSuggestions( + suggest, + new Entry("suggestion1", "group1", 1000), + new Entry("suggestion2", "group2", 999), + new Entry("suggestion3", "group3", 998), + new Entry("suggestion4", "group4", 997), + new Entry("suggestion5", "group5", 996)); + reader.close(); + iw.close(); + } + @Test public void testAllContextQuery() throws Exception { Analyzer analyzer = new MockAnalyzer(random()); From 118afb7054fcc53acc089d02853a0d041072c224 Mon Sep 17 00:00:00 2001 From: Michael Froh 
Date: Mon, 27 Jan 2025 05:20:02 -0800 Subject: [PATCH 51/88] Upgrade commons-codec from 1.13.0 to 1.17.2 (#14129) --- lucene/analysis.tests/src/test/module-info.java | 1 + lucene/licenses/commons-codec-1.13.jar.sha1 | 1 - lucene/licenses/commons-codec-1.17.2.jar.sha1 | 1 + versions.lock | 4 ++-- versions.toml | 2 +- 5 files changed, 5 insertions(+), 4 deletions(-) delete mode 100644 lucene/licenses/commons-codec-1.13.jar.sha1 create mode 100644 lucene/licenses/commons-codec-1.17.2.jar.sha1 diff --git a/lucene/analysis.tests/src/test/module-info.java b/lucene/analysis.tests/src/test/module-info.java index 3a67c75febb0..d4d8957252b2 100644 --- a/lucene/analysis.tests/src/test/module-info.java +++ b/lucene/analysis.tests/src/test/module-info.java @@ -33,6 +33,7 @@ requires org.apache.lucene.analysis.smartcn; requires org.apache.lucene.analysis.stempel; requires org.apache.lucene.test_framework; + requires org.apache.commons.codec; exports org.apache.lucene.analysis.tests; } diff --git a/lucene/licenses/commons-codec-1.13.jar.sha1 b/lucene/licenses/commons-codec-1.13.jar.sha1 deleted file mode 100644 index 4d9344b4a4e6..000000000000 --- a/lucene/licenses/commons-codec-1.13.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -3f18e1aa31031d89db6f01ba05d501258ce69d2c diff --git a/lucene/licenses/commons-codec-1.17.2.jar.sha1 b/lucene/licenses/commons-codec-1.17.2.jar.sha1 new file mode 100644 index 000000000000..3ef561c0262f --- /dev/null +++ b/lucene/licenses/commons-codec-1.17.2.jar.sha1 @@ -0,0 +1 @@ +cd6bb9d856db5f61871a94d5801efd0b93b7fcb2 diff --git a/versions.lock b/versions.lock index 26de44f99e2d..f3057288a9f7 100644 --- a/versions.lock +++ b/versions.lock @@ -4,7 +4,7 @@ "main_dependencies" : { "com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.8.1" : "fa9ef26b,refs=4", "com.ibm.icu:icu4j:74.2" : "47ea4550,refs=6", - "commons-codec:commons-codec:1.13" : "e9962aab,refs=4", + "commons-codec:commons-codec:1.17.2" : "e9962aab,refs=4", "io.sgr:s2-geometry-library-java:1.0.0" : "cbc357ab,refs=4", "junit:junit:4.13.1" : "fa9ef26b,refs=4", "net.sf.jopt-simple:jopt-simple:5.0.4" : "85a1e4c6,refs=2", @@ -46,7 +46,7 @@ "com.google.j2objc:j2objc-annotations:1.3" : "6897bc09,refs=38", "com.google.protobuf:protobuf-java:3.19.2" : "6897bc09,refs=38", "com.ibm.icu:icu4j:74.2" : "ffa00415,refs=8", - "commons-codec:commons-codec:1.13" : "733734f0,refs=6", + "commons-codec:commons-codec:1.17.2" : "733734f0,refs=6", "io.github.java-diff-utils:java-diff-utils:4.0" : "6897bc09,refs=38", "io.sgr:s2-geometry-library-java:1.0.0" : "1d5a4b2b,refs=4", "javax.inject:javax.inject:1" : "6897bc09,refs=38", diff --git a/versions.toml b/versions.toml index 80dc51f39bf2..d1e693e03ca3 100644 --- a/versions.toml +++ b/versions.toml @@ -2,7 +2,7 @@ antlr = "4.11.1" asm = "9.6" assertj = "3.21.0" -commons-codec = "1.13" +commons-codec = "1.17.2" commons-compress = "1.19" ecj = "3.36.0" errorprone = "2.18.0" From cab88cafb1715c8844b69e366b7c8558c3c5f7d0 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Tue, 28 Jan 2025 08:34:41 -0500 Subject: [PATCH 52/88] Make knn graph conn writing more consistent (#14174) * Make graph writing more consistent * correct concurrent connected components logic --- .../lucene99/Lucene99HnswVectorsWriter.java | 29 ++++++++++++------- .../util/hnsw/HnswConcurrentMergeBuilder.java | 4 +-- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java index 4983fdec6bff..e219157ab986 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java @@ -243,13 +243,14 @@ private HnswGraph reconstructAndWriteGraph( nodesByLevel.add(null); int maxOrd = graph.size(); + int[] scratch = new int[graph.maxConn() * 2]; NodesIterator nodesOnLevel0 = graph.getNodesOnLevel(0); levelNodeOffsets[0] = new int[nodesOnLevel0.size()]; while (nodesOnLevel0.hasNext()) { int node = nodesOnLevel0.nextInt(); NeighborArray neighbors = graph.getNeighbors(0, newToOldMap[node]); long offset = vectorIndex.getFilePointer(); - reconstructAndWriteNeighbours(neighbors, oldToNewMap, maxOrd); + reconstructAndWriteNeighbours(neighbors, oldToNewMap, scratch, maxOrd); levelNodeOffsets[0][node] = Math.toIntExact(vectorIndex.getFilePointer() - offset); } @@ -266,7 +267,7 @@ private HnswGraph reconstructAndWriteGraph( for (int node : newNodes) { NeighborArray neighbors = graph.getNeighbors(level, newToOldMap[node]); long offset = vectorIndex.getFilePointer(); - reconstructAndWriteNeighbours(neighbors, oldToNewMap, maxOrd); + reconstructAndWriteNeighbours(neighbors, oldToNewMap, scratch, maxOrd); levelNodeOffsets[level][nodeOffsetIndex++] = Math.toIntExact(vectorIndex.getFilePointer() - offset); } @@ -313,25 +314,33 @@ public NodesIterator getNodesOnLevel(int level) { }; } - private void reconstructAndWriteNeighbours(NeighborArray neighbors, int[] oldToNewMap, int maxOrd) - throws IOException { + private void reconstructAndWriteNeighbours( + NeighborArray neighbors, int[] oldToNewMap, int[] scratch, int maxOrd) throws IOException { int size = neighbors.size(); - vectorIndex.writeVInt(size); - // Destructively modify; it's ok we are discarding it after this int[] nnodes = neighbors.nodes(); for (int i = 0; i < size; i++) { nnodes[i] = oldToNewMap[nnodes[i]]; } Arrays.sort(nnodes, 0, size); + int actualSize = 0; + if (size > 0) { + scratch[0] = nnodes[0]; + actualSize = 1; + } // Now that we have sorted, do delta encoding to minimize the required bits to store the // information - for (int i = size - 1; i > 0; --i) { + for (int i = 1; i < size; i++) { assert nnodes[i] < maxOrd : "node too large: " + nnodes[i] + ">=" + maxOrd; - nnodes[i] -= nnodes[i - 1]; + if (nnodes[i - 1] == nnodes[i]) { + continue; + } + scratch[actualSize++] = nnodes[i] - nnodes[i - 1]; } - for (int i = 0; i < size; i++) { - vectorIndex.writeVInt(nnodes[i]); + // Write the size after duplicates are removed + vectorIndex.writeVInt(actualSize); + for (int i = 0; i < actualSize; i++) { + vectorIndex.writeVInt(scratch[i]); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java index d2e81addc5d4..d9d58c829d3d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/HnswConcurrentMergeBuilder.java @@ -90,9 +90,7 @@ public OnHeapHnswGraph build(int maxOrd) throws IOException { }); } taskExecutor.invokeAll(futures); - finish(); - frozen = true; - return workers[0].getCompletedGraph(); + return getCompletedGraph(); } @Override From 71256cced2e4e4bae67ccca3159d50099e7eb6f5 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 28 Jan 2025 15:16:30 +0100 Subject: [PATCH 53/88] Specialize 
DisiPriorityQueue for the 2-clauses case. (#14070) Disjunctions with 2 clauses are rather common. Specializing this case enables some shortcuts. --- .../lucene/search/DisiPriorityQueue.java | 216 +++------------- .../lucene/search/DisiPriorityQueue2.java | 110 +++++++++ .../lucene/search/DisiPriorityQueueN.java | 230 ++++++++++++++++++ .../search/DisjunctionDISIApproximation.java | 2 +- .../lucene/search/DocIdSetBulkIterator.java | 34 +++ .../lucene/search/MaxScoreBulkScorer.java | 2 +- .../org/apache/lucene/search/WANDScorer.java | 8 +- .../lucene/search/TestDisiPriorityQueue.java | 38 ++- .../lucene/sandbox/search/CoveringScorer.java | 2 +- 9 files changed, 448 insertions(+), 194 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue2.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueueN.java create mode 100644 lucene/core/src/java/org/apache/lucene/search/DocIdSetBulkIterator.java diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java index 034f46ed93f9..d6bdf82e48d0 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue.java @@ -16,8 +16,6 @@ */ package org.apache.lucene.search; -import java.util.Arrays; -import java.util.Iterator; import org.apache.lucene.util.PriorityQueue; /** @@ -27,205 +25,51 @@ * * @lucene.internal */ -public final class DisiPriorityQueue implements Iterable { - - static int leftNode(int node) { - return ((node + 1) << 1) - 1; - } - - static int rightNode(int leftNode) { - return leftNode + 1; - } - - static int parentNode(int node) { - return ((node + 1) >>> 1) - 1; +public abstract sealed class DisiPriorityQueue implements Iterable + permits DisiPriorityQueue2, DisiPriorityQueueN { + + /** Create a {@link DisiPriorityQueue} of the given maximum size. */ + public static DisiPriorityQueue ofMaxSize(int maxSize) { + if (maxSize <= 2) { + return new DisiPriorityQueue2(); + } else { + return new DisiPriorityQueueN(maxSize); + } } - private final DisiWrapper[] heap; - private int size; + /** Return the number of entries in this heap. */ + public abstract int size(); - public DisiPriorityQueue(int maxSize) { - heap = new DisiWrapper[maxSize]; - size = 0; - } - - public int size() { - return size; - } - - public DisiWrapper top() { - return heap[0]; - } + /** Return top value in this heap, or null if the heap is empty. */ + public abstract DisiWrapper top(); /** Return the 2nd least value in this heap, or null if the heap contains less than 2 values. */ - public DisiWrapper top2() { - switch (size()) { - case 0: - case 1: - return null; - case 2: - return heap[1]; - default: - if (heap[1].doc <= heap[2].doc) { - return heap[1]; - } else { - return heap[2]; - } - } - } + public abstract DisiWrapper top2(); /** Get the list of scorers which are on the current doc. 
*/ - public DisiWrapper topList() { - final DisiWrapper[] heap = this.heap; - final int size = this.size; - DisiWrapper list = heap[0]; - list.next = null; - if (size >= 3) { - list = topList(list, heap, size, 1); - list = topList(list, heap, size, 2); - } else if (size == 2 && heap[1].doc == list.doc) { - list = prepend(heap[1], list); - } - return list; - } - - // prepend w1 (iterator) to w2 (list) - private DisiWrapper prepend(DisiWrapper w1, DisiWrapper w2) { - w1.next = w2; - return w1; - } - - private DisiWrapper topList(DisiWrapper list, DisiWrapper[] heap, int size, int i) { - final DisiWrapper w = heap[i]; - if (w.doc == list.doc) { - list = prepend(w, list); - final int left = leftNode(i); - final int right = left + 1; - if (right < size) { - list = topList(list, heap, size, left); - list = topList(list, heap, size, right); - } else if (left < size && heap[left].doc == list.doc) { - list = prepend(heap[left], list); - } - } - return list; - } + public abstract DisiWrapper topList(); - public DisiWrapper add(DisiWrapper entry) { - final DisiWrapper[] heap = this.heap; - final int size = this.size; - heap[size] = entry; - upHeap(size); - this.size = size + 1; - return heap[0]; - } + /** Add a {@link DisiWrapper} to this queue and return the top entry. */ + public abstract DisiWrapper add(DisiWrapper entry); + /** Bulk add. */ public void addAll(DisiWrapper[] entries, int offset, int len) { - // Nothing to do if empty: - if (len == 0) { - return; - } - - // Fail early if we're going to over-fill: - if (size + len > heap.length) { - throw new IndexOutOfBoundsException( - "Cannot add " - + len - + " elements to a queue with remaining capacity " - + (heap.length - size)); - } - - // Copy the entries over to our heap array: - System.arraycopy(entries, offset, heap, size, len); - size += len; - - // Heapify in bulk: - final int firstLeafIndex = size >>> 1; - for (int rootIndex = firstLeafIndex - 1; rootIndex >= 0; rootIndex--) { - int parentIndex = rootIndex; - DisiWrapper parent = heap[parentIndex]; - while (parentIndex < firstLeafIndex) { - int childIndex = leftNode(parentIndex); - int rightChildIndex = rightNode(childIndex); - DisiWrapper child = heap[childIndex]; - if (rightChildIndex < size && heap[rightChildIndex].doc < child.doc) { - child = heap[rightChildIndex]; - childIndex = rightChildIndex; - } - if (child.doc >= parent.doc) { - break; - } - heap[parentIndex] = child; - parentIndex = childIndex; - } - heap[parentIndex] = parent; + for (int i = 0; i < len; ++i) { + add(entries[offset + i]); } } - public DisiWrapper pop() { - final DisiWrapper[] heap = this.heap; - final DisiWrapper result = heap[0]; - final int i = --size; - heap[0] = heap[i]; - heap[i] = null; - downHeap(i); - return result; - } + /** Remove the top entry and return it. */ + public abstract DisiWrapper pop(); - public DisiWrapper updateTop() { - downHeap(size); - return heap[0]; - } + /** Rebalance this heap and return the top entry. */ + public abstract DisiWrapper updateTop(); - DisiWrapper updateTop(DisiWrapper topReplacement) { - heap[0] = topReplacement; - return updateTop(); - } + /** + * Replace the top entry with the given entry, rebalance the heap, and return the new top entry. + */ + abstract DisiWrapper updateTop(DisiWrapper topReplacement); /** Clear the heap. 
*/ - public void clear() { - Arrays.fill(heap, null); - size = 0; - } - - void upHeap(int i) { - final DisiWrapper node = heap[i]; - final int nodeDoc = node.doc; - int j = parentNode(i); - while (j >= 0 && nodeDoc < heap[j].doc) { - heap[i] = heap[j]; - i = j; - j = parentNode(j); - } - heap[i] = node; - } - - void downHeap(int size) { - int i = 0; - final DisiWrapper node = heap[0]; - int j = leftNode(i); - if (j < size) { - int k = rightNode(j); - if (k < size && heap[k].doc < heap[j].doc) { - j = k; - } - if (heap[j].doc < node.doc) { - do { - heap[i] = heap[j]; - i = j; - j = leftNode(i); - k = rightNode(j); - if (k < size && heap[k].doc < heap[j].doc) { - j = k; - } - } while (j < size && heap[j].doc < node.doc); - heap[i] = node; - } - } - } - - @Override - public Iterator iterator() { - return Arrays.asList(heap).subList(0, size).iterator(); - } + public abstract void clear(); } diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue2.java b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue2.java new file mode 100644 index 000000000000..b7e587382db7 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueue2.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; + +/** {@link DisiPriorityQueue} of two entries or less. */ +final class DisiPriorityQueue2 extends DisiPriorityQueue { + + private DisiWrapper top, top2; + + @Override + public Iterator iterator() { + if (top2 != null) { + return Arrays.asList(top, top2).iterator(); + } else if (top != null) { + return Collections.singleton(top).iterator(); + } else { + return Collections.emptyIterator(); + } + } + + @Override + public int size() { + return top2 == null ? (top == null ? 
0 : 1) : 2; + } + + @Override + public DisiWrapper top() { + return top; + } + + @Override + public DisiWrapper top2() { + return top2; + } + + @Override + public DisiWrapper topList() { + DisiWrapper topList = null; + if (top != null) { + top.next = null; + topList = top; + if (top2 != null && top.doc == top2.doc) { + top2.next = topList; + topList = top2; + } + } + return topList; + } + + @Override + public DisiWrapper add(DisiWrapper entry) { + if (top == null) { + return top = entry; + } else if (top2 == null) { + top2 = entry; + return updateTop(); + } else { + throw new IllegalStateException( + "Trying to add a 3rd element to a DisiPriorityQueue configured with a max size of 2"); + } + } + + @Override + public DisiWrapper pop() { + DisiWrapper ret = top; + top = top2; + top2 = null; + return ret; + } + + @Override + public DisiWrapper updateTop() { + if (top2 != null && top2.doc < top.doc) { + DisiWrapper tmp = top; + top = top2; + top2 = tmp; + } + return top; + } + + @Override + DisiWrapper updateTop(DisiWrapper topReplacement) { + top = topReplacement; + return updateTop(); + } + + @Override + public void clear() { + top = null; + top2 = null; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueueN.java b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueueN.java new file mode 100644 index 000000000000..b841c3ef0ef1 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DisiPriorityQueueN.java @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search; + +import java.util.Arrays; +import java.util.Iterator; + +final class DisiPriorityQueueN extends DisiPriorityQueue { + + static int leftNode(int node) { + return ((node + 1) << 1) - 1; + } + + static int rightNode(int leftNode) { + return leftNode + 1; + } + + static int parentNode(int node) { + return ((node + 1) >>> 1) - 1; + } + + private final DisiWrapper[] heap; + private int size; + + DisiPriorityQueueN(int maxSize) { + heap = new DisiWrapper[maxSize]; + size = 0; + } + + @Override + public int size() { + return size; + } + + @Override + public DisiWrapper top() { + return heap[0]; + } + + @Override + public DisiWrapper top2() { + switch (size()) { + case 0: + case 1: + return null; + case 2: + return heap[1]; + default: + if (heap[1].doc <= heap[2].doc) { + return heap[1]; + } else { + return heap[2]; + } + } + } + + @Override + public DisiWrapper topList() { + final DisiWrapper[] heap = this.heap; + final int size = this.size; + DisiWrapper list = heap[0]; + list.next = null; + if (size >= 3) { + list = topList(list, heap, size, 1); + list = topList(list, heap, size, 2); + } else if (size == 2 && heap[1].doc == list.doc) { + list = prepend(heap[1], list); + } + return list; + } + + // prepend w1 (iterator) to w2 (list) + private DisiWrapper prepend(DisiWrapper w1, DisiWrapper w2) { + w1.next = w2; + return w1; + } + + private DisiWrapper topList(DisiWrapper list, DisiWrapper[] heap, int size, int i) { + final DisiWrapper w = heap[i]; + if (w.doc == list.doc) { + list = prepend(w, list); + final int left = leftNode(i); + final int right = rightNode(left); + if (right < size) { + list = topList(list, heap, size, left); + list = topList(list, heap, size, right); + } else if (left < size && heap[left].doc == list.doc) { + list = prepend(heap[left], list); + } + } + return list; + } + + @Override + public DisiWrapper add(DisiWrapper entry) { + final DisiWrapper[] heap = this.heap; + final int size = this.size; + heap[size] = entry; + upHeap(size); + this.size = size + 1; + return heap[0]; + } + + @Override + public void addAll(DisiWrapper[] entries, int offset, int len) { + // Nothing to do if empty: + if (len == 0) { + return; + } + + // Fail early if we're going to over-fill: + if (size + len > heap.length) { + throw new IndexOutOfBoundsException( + "Cannot add " + + len + + " elements to a queue with remaining capacity " + + (heap.length - size)); + } + + // Copy the entries over to our heap array: + System.arraycopy(entries, offset, heap, size, len); + size += len; + + // Heapify in bulk: + final int firstLeafIndex = size >>> 1; + for (int rootIndex = firstLeafIndex - 1; rootIndex >= 0; rootIndex--) { + int parentIndex = rootIndex; + DisiWrapper parent = heap[parentIndex]; + while (parentIndex < firstLeafIndex) { + int childIndex = leftNode(parentIndex); + int rightChildIndex = rightNode(childIndex); + DisiWrapper child = heap[childIndex]; + if (rightChildIndex < size && heap[rightChildIndex].doc < child.doc) { + child = heap[rightChildIndex]; + childIndex = rightChildIndex; + } + if (child.doc >= parent.doc) { + break; + } + heap[parentIndex] = child; + parentIndex = childIndex; + } + heap[parentIndex] = parent; + } + } + + @Override + public DisiWrapper pop() { + final DisiWrapper[] heap = this.heap; + final DisiWrapper result = heap[0]; + final int i = --size; + heap[0] = heap[i]; + heap[i] = null; + downHeap(i); + return result; + } + + @Override + public DisiWrapper updateTop() { + downHeap(size); + return heap[0]; + } + + @Override + 
DisiWrapper updateTop(DisiWrapper topReplacement) { + heap[0] = topReplacement; + return updateTop(); + } + + @Override + public void clear() { + Arrays.fill(heap, null); + size = 0; + } + + void upHeap(int i) { + final DisiWrapper node = heap[i]; + final int nodeDoc = node.doc; + int j = parentNode(i); + while (j >= 0 && nodeDoc < heap[j].doc) { + heap[i] = heap[j]; + i = j; + j = parentNode(j); + } + heap[i] = node; + } + + void downHeap(int size) { + int i = 0; + final DisiWrapper node = heap[0]; + int j = leftNode(i); + if (j < size) { + int k = rightNode(j); + if (k < size && heap[k].doc < heap[j].doc) { + j = k; + } + if (heap[j].doc < node.doc) { + do { + heap[i] = heap[j]; + i = j; + j = leftNode(i); + k = rightNode(j); + if (k < size && heap[k].doc < heap[j].doc) { + j = k; + } + } while (j < size && heap[j].doc < node.doc); + heap[i] = node; + } + } + } + + @Override + public Iterator iterator() { + return Arrays.asList(heap).subList(0, size).iterator(); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java index cedababbce6b..08018dacf9b8 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionDISIApproximation.java @@ -91,7 +91,7 @@ public DisjunctionDISIApproximation( // Build the PQ: assert lastIdx >= -1 && lastIdx < wrappers.length - 1; int pqLen = wrappers.length - lastIdx - 1; - leadIterators = new DisiPriorityQueue(pqLen); + leadIterators = DisiPriorityQueue.ofMaxSize(pqLen); leadIterators.addAll(wrappers, lastIdx + 1, pqLen); // Build the non-PQ list: diff --git a/lucene/core/src/java/org/apache/lucene/search/DocIdSetBulkIterator.java b/lucene/core/src/java/org/apache/lucene/search/DocIdSetBulkIterator.java new file mode 100644 index 000000000000..87912beecccb --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/DocIdSetBulkIterator.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import org.apache.lucene.util.Bits; + +/** Bulk iterator over a {@link DocIdSetIterator}. */ +public abstract class DocIdSetBulkIterator { + + /** Sole constructor, invoked by sub-classes. */ + protected DocIdSetBulkIterator() {} + + /** + * Iterate over documents contained in this iterator and call {@link LeafCollector#collect} on + * them. 
+ */ + public abstract void iterate(LeafCollector collector, Bits acceptDocs, int min, int max) + throws IOException; +} diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java index 93dd1ea91e31..30b1d4b7e5a8 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java @@ -64,7 +64,7 @@ final class MaxScoreBulkScorer extends BulkScorer { allScorers[i++] = w; } this.cost = cost; - essentialQueue = new DisiPriorityQueue(allScorers.length); + essentialQueue = DisiPriorityQueue.ofMaxSize(allScorers.length); maxScoreSums = new double[allScorers.length]; } diff --git a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java index 897713dbe17d..88ffa4a0c62e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/WANDScorer.java @@ -16,9 +16,9 @@ */ package org.apache.lucene.search; -import static org.apache.lucene.search.DisiPriorityQueue.leftNode; -import static org.apache.lucene.search.DisiPriorityQueue.parentNode; -import static org.apache.lucene.search.DisiPriorityQueue.rightNode; +import static org.apache.lucene.search.DisiPriorityQueueN.leftNode; +import static org.apache.lucene.search.DisiPriorityQueueN.parentNode; +import static org.apache.lucene.search.DisiPriorityQueueN.rightNode; import static org.apache.lucene.search.ScorerUtil.costWithMinShouldMatch; import java.io.IOException; @@ -170,7 +170,7 @@ private static long scaleMinScore(float minScore, int scalingFactor) { this.scoreMode = scoreMode; - head = new DisiPriorityQueue(scorers.size()); + head = DisiPriorityQueue.ofMaxSize(scorers.size()); // there can be at most num_scorers - 1 scorers beyond the current position tail = new DisiWrapper[scorers.size()]; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDisiPriorityQueue.java b/lucene/core/src/test/org/apache/lucene/search/TestDisiPriorityQueue.java index fb7afac8ba47..967c5a34d7dc 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestDisiPriorityQueue.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestDisiPriorityQueue.java @@ -26,6 +26,42 @@ public class TestDisiPriorityQueue extends LuceneTestCase { + public void testDisiPriorityQueue2() throws IOException { + Random r = random(); + DisiWrapper w1 = wrapper(randomDisi(r)); + DisiWrapper w2 = wrapper(randomDisi(r)); + DisiWrapper w3 = wrapper(randomDisi(r)); + + DisiPriorityQueue pq = DisiPriorityQueue.ofMaxSize(2); + w1.doc = 1; + w2.doc = 0; + assertNull(pq.top()); + assertEquals(0, pq.size()); + assertSame(w1, pq.add(w1)); + assertSame(w1, pq.top()); + assertEquals(1, pq.size()); + assertSame(w2, pq.add(w2)); + assertSame(w2, pq.top()); + assertEquals(2, pq.size()); + expectThrows(IllegalStateException.class, () -> pq.add(w3)); + + w2.doc = 1; + assertSame(w2, pq.updateTop()); + DisiWrapper topList = pq.topList(); + assertSame(w1, topList); + assertSame(w2, topList.next); + assertNull(topList.next.next); + + w2.doc = 2; + assertSame(w1, pq.updateTop()); + topList = pq.topList(); + assertSame(w1, topList); + assertNull(topList.next); + + assertSame(w1, pq.pop()); + assertSame(w2, pq.top()); + } + public void testRandom() throws Exception { Random r = random(); @@ -37,7 +73,7 @@ public void testRandom() throws Exception { all[i] = w; } - DisiPriorityQueue pq = 
new DisiPriorityQueue(size); + DisiPriorityQueue pq = DisiPriorityQueue.ofMaxSize(size); if (r.nextBoolean()) { for (DisiWrapper w : all) { pq.add(w); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringScorer.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringScorer.java index dfedb51ed1f4..09e2bb57af7a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringScorer.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/CoveringScorer.java @@ -51,7 +51,7 @@ final class CoveringScorer extends Scorer { this.minMatchValues = minMatchValues; this.doc = -1; - subScorers = new DisiPriorityQueue(scorers.size()); + subScorers = DisiPriorityQueue.ofMaxSize(scorers.size()); for (Scorer scorer : scorers) { subScorers.add(new DisiWrapper(scorer, false)); From a7b7f0d6583c5532337320efee71d4797f473b60 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Tue, 28 Jan 2025 14:31:14 -0800 Subject: [PATCH 54/88] Upgrade OpenNLP from 2.3.2 to 2.5.3 (#14130) --- lucene/CHANGES.txt | 3 +++ .../lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java | 6 +++--- lucene/licenses/opennlp-tools-2.3.2.jar.sha1 | 1 - lucene/licenses/opennlp-tools-2.5.3.jar.sha1 | 1 + lucene/licenses/slf4j-api-1.7.36.jar.sha1 | 1 - lucene/licenses/slf4j-api-2.0.16.jar.sha1 | 1 + versions.lock | 8 ++++---- versions.toml | 2 +- 8 files changed, 13 insertions(+), 10 deletions(-) delete mode 100644 lucene/licenses/opennlp-tools-2.3.2.jar.sha1 create mode 100644 lucene/licenses/opennlp-tools-2.5.3.jar.sha1 delete mode 100644 lucene/licenses/slf4j-api-1.7.36.jar.sha1 create mode 100644 lucene/licenses/slf4j-api-2.0.16.jar.sha1 diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b36a48c2fd00..fb9e7665a9e1 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -102,6 +102,9 @@ Other * GITHUB#14091: Cover all DataType. (Lu Xugang) +* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j + from 1.7.36 to 2.0.16. 
(Michael Froh) + ======================= Lucene 10.1.0 ======================= API Changes diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java index dee4afefc58a..ef7a6fb62452 100644 --- a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java +++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java @@ -17,8 +17,8 @@ package org.apache.lucene.analysis.opennlp.tools; -import java.io.IOException; import opennlp.tools.postag.POSModel; +import opennlp.tools.postag.POSTagFormat; import opennlp.tools.postag.POSTagger; import opennlp.tools.postag.POSTaggerME; @@ -29,8 +29,8 @@ public class NLPPOSTaggerOp { private final POSTagger tagger; - public NLPPOSTaggerOp(POSModel model) throws IOException { - tagger = new POSTaggerME(model); + public NLPPOSTaggerOp(POSModel model) { + tagger = new POSTaggerME(model, POSTagFormat.PENN); } public synchronized String[] getPOSTags(String[] words) { diff --git a/lucene/licenses/opennlp-tools-2.3.2.jar.sha1 b/lucene/licenses/opennlp-tools-2.3.2.jar.sha1 deleted file mode 100644 index 94b2924f8fa7..000000000000 --- a/lucene/licenses/opennlp-tools-2.3.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -d739edba1e729691ed5ab80e1ccf330555a02ea7 diff --git a/lucene/licenses/opennlp-tools-2.5.3.jar.sha1 b/lucene/licenses/opennlp-tools-2.5.3.jar.sha1 new file mode 100644 index 000000000000..fb01299fa29d --- /dev/null +++ b/lucene/licenses/opennlp-tools-2.5.3.jar.sha1 @@ -0,0 +1 @@ +4b544138ec079c1c73dc2c1b928506871c4b1b47 diff --git a/lucene/licenses/slf4j-api-1.7.36.jar.sha1 b/lucene/licenses/slf4j-api-1.7.36.jar.sha1 deleted file mode 100644 index 828b7cf7e056..000000000000 --- a/lucene/licenses/slf4j-api-1.7.36.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -6c62681a2f655b49963a5983b8b0950a6120ae14 diff --git a/lucene/licenses/slf4j-api-2.0.16.jar.sha1 b/lucene/licenses/slf4j-api-2.0.16.jar.sha1 new file mode 100644 index 000000000000..b1bb75be39b1 --- /dev/null +++ b/lucene/licenses/slf4j-api-2.0.16.jar.sha1 @@ -0,0 +1 @@ +0172931663a09a1fa515567af5fbef00897d3c04 diff --git a/versions.lock b/versions.lock index f3057288a9f7..07f8ff30543d 100644 --- a/versions.lock +++ b/versions.lock @@ -12,7 +12,7 @@ "org.antlr:antlr4-runtime:4.11.1" : "d9953130,refs=4", "org.apache.commons:commons-compress:1.19" : "5ce8cdc6,refs=2", "org.apache.commons:commons-math3:3.6.1" : "85a1e4c6,refs=2", - "org.apache.opennlp:opennlp-tools:2.3.2" : "2f760bab,refs=4", + "org.apache.opennlp:opennlp-tools:2.5.3" : "2f760bab,refs=4", "org.carrot2:morfologik-fsa:2.1.9" : "79af844b,refs=4", "org.carrot2:morfologik-polish:2.1.9" : "fe494320,refs=3", "org.carrot2:morfologik-stemming:2.1.9" : "79af844b,refs=4", @@ -22,7 +22,7 @@ "org.ow2.asm:asm:9.6" : "d9953130,refs=4", "org.ow2.asm:asm-commons:9.6" : "d9953130,refs=4", "org.ow2.asm:asm-tree:9.6" : "d9953130,refs=4", - "org.slf4j:slf4j-api:1.7.36" : "2f760bab,refs=4", + "org.slf4j:slf4j-api:2.0.16" : "2f760bab,refs=4", "ua.net.nlp:morfologik-ukrainian-search:4.9.1" : "fe494320,refs=3", "xerces:xercesImpl:2.12.0" : "5ce8cdc6,refs=2" }, @@ -56,7 +56,7 @@ "org.antlr:antlr4-runtime:4.11.1" : "6fbc4021,refs=5", "org.apache.commons:commons-compress:1.19" : "6f16ff86,refs=2", "org.apache.commons:commons-math3:3.6.1" : "152d9f78,refs=3", - "org.apache.opennlp:opennlp-tools:2.3.2" : "b91715f0,refs=6", + "org.apache.opennlp:opennlp-tools:2.5.3" : 
"b91715f0,refs=6", "org.assertj:assertj-core:3.21.0" : "b7ba1646,refs=2", "org.carrot2:morfologik-fsa:2.1.9" : "e077a675,refs=8", "org.carrot2:morfologik-polish:2.1.9" : "cb00cecf,refs=5", @@ -73,7 +73,7 @@ "org.ow2.asm:asm-commons:9.6" : "6fbc4021,refs=5", "org.ow2.asm:asm-tree:9.6" : "6fbc4021,refs=5", "org.pcollections:pcollections:3.1.4" : "6897bc09,refs=38", - "org.slf4j:slf4j-api:1.7.36" : "b91715f0,refs=6", + "org.slf4j:slf4j-api:2.0.16" : "b91715f0,refs=6", "ua.net.nlp:morfologik-ukrainian-search:4.9.1" : "cb00cecf,refs=5", "xerces:xercesImpl:2.12.0" : "6f16ff86,refs=2" } diff --git a/versions.toml b/versions.toml index d1e693e03ca3..679287f9d7db 100644 --- a/versions.toml +++ b/versions.toml @@ -25,7 +25,7 @@ minJava = "21" morfologik = "2.1.9" morfologik-ukrainian = "4.9.1" nekohtml = "1.9.17" -opennlp = "2.3.2" +opennlp = "2.5.3" procfork = "1.0.6" randomizedtesting = "2.8.1" rat = "0.14" From 3bc4469b9d7a73d7bc03d75a839a4663aaa4e34f Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 29 Jan 2025 13:44:19 +0000 Subject: [PATCH 55/88] Add temp permissions, etc, to allow testing to succeed. with this I can minimally get the following to complete successfully: gradlew :lucene:sandbox:test --tests "org.apache.lucene.sandbox.vectorsearch.*" --- gradle/testing/defaults-tests.gradle | 2 +- gradle/testing/randomization/policies/tests.policy | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle index 14e64647d667..be0004b72378 100644 --- a/gradle/testing/defaults-tests.gradle +++ b/gradle/testing/defaults-tests.gradle @@ -143,7 +143,7 @@ allprojects { ':lucene:codecs', ":lucene:distribution.tests", ":lucene:test-framework" - ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core') + ] ? 
'ALL-UNNAMED' : 'org.apache.lucene.core,com.nvidia.cuvs') def loggingConfigFile = layout.projectDirectory.file("${resources}/logging.properties") def tempDir = layout.projectDirectory.dir(testsTmpDir.toString()) diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy index f8e09ba03661..41cb5d60e44e 100644 --- a/gradle/testing/randomization/policies/tests.policy +++ b/gradle/testing/randomization/policies/tests.policy @@ -80,6 +80,12 @@ grant { permission java.io.FilePermission "${hunspell.corpora}${/}-", "read"; permission java.io.FilePermission "${hunspell.dictionaries}", "read"; permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read"; + + // TODO: these are just temporary to allow testing with cuvs-java + permission java.lang.RuntimePermission "getenv.CUVS_JAVA_SO_PATH"; + permission java.io.FilePermission "${/}-", "read"; + // For temporary files to communicate with cuvs + permission java.io.FilePermission "${/}tmp${/}-", "write,delete"; }; // Permissions for jacoco code coverage From 705283f6b0bc5650a68bf8928b93831afe98ac25 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 29 Jan 2025 13:51:18 +0000 Subject: [PATCH 56/88] package-private where possible --- .../lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java | 2 +- .../java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java | 2 +- .../lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java | 2 +- .../org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java | 2 +- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java | 2 +- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java | 2 +- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java | 2 +- .../lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java | 2 +- .../apache/lucene/sandbox/vectorsearch/SegmentInputStream.java | 2 +- .../src/java/org/apache/lucene/sandbox/vectorsearch/Util.java | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java index de2c7315f033..e712d69c1ef1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -23,7 +23,7 @@ import org.apache.lucene.index.FieldInfo; /** CuVS based fields writer */ -public class CagraFieldVectorsWriter extends KnnFieldVectorsWriter { +/*package-private*/ class CagraFieldVectorsWriter extends KnnFieldVectorsWriter { public final String fieldName; public final ConcurrentHashMap vectors = diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index 9258a04fc5c2..7b8c19996195 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -22,7 +22,7 @@ import java.util.Objects; /** This class holds references to the actual CuVS Index (Cagra, Brute force, etc.) 
*/ -public class CuVSIndex { +/*package-private*/ class CuVSIndex { private final CagraIndex cagraIndex; private final BruteForceIndex bruteforceIndex; private final List mapping; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java index 2f6c636590ef..efa4ce51e77a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -25,7 +25,7 @@ import org.apache.lucene.util.Bits; /** Query for CuVS */ -public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { +/*package-private*/ class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { private final int iTopK; private final int searchWidth; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java index ddbf8fc9d29e..9b12cdf61012 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java @@ -27,7 +27,7 @@ import java.util.zip.ZipOutputStream; /** Methods to deal with a CuVS composite file inside a segment */ -public class CuVSSegmentFile implements AutoCloseable { +/*package-private*/ class CuVSSegmentFile implements AutoCloseable { private final ZipOutputStream zos; private Set filesAdded = new HashSet(); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index d2f6c78417f5..96f1c889be5d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -25,7 +25,7 @@ import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; /** CuVS based KnnVectorsFormat for GPU acceleration */ -public class CuVSVectorsFormat extends KnnVectorsFormat { +/*package-private*/ class CuVSVectorsFormat extends KnnVectorsFormat { public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; public static final String VECTOR_DATA_EXTENSION = "cag"; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index d65dfbd288cc..f23255792b84 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -52,7 +52,7 @@ import org.apache.lucene.util.IOUtils; /** KnnVectorsReader instance associated with CuVS format */ -public class CuVSVectorsReader extends KnnVectorsReader { +/*package-private*/ class CuVSVectorsReader extends KnnVectorsReader { // protected Logger log = Logger.getLogger(getClass().getName()); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 2f595def8446..c652f5333a74 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -43,7 +43,7 @@ import org.apache.lucene.util.SuppressForbidden; /** KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU */ -public class CuVSVectorsWriter extends KnnVectorsWriter { +/*package-private*/ class CuVSVectorsWriter extends KnnVectorsWriter { // protected Logger log = Logger.getLogger(getClass().getName()); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java index ffba5f0c0f1f..23d524cef182 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -24,7 +24,7 @@ import org.apache.lucene.search.TotalHits; /** KnnCollector for CuVS */ -public class PerLeafCuVSKnnCollector implements KnnCollector { +/*package-private*/ class PerLeafCuVSKnnCollector implements KnnCollector { public List scoreDocs; public int topK = 0; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java index 73fba879f6ad..8f81c8bb7f15 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java @@ -21,7 +21,7 @@ import org.apache.lucene.store.IndexInput; /** InputStream semantics for reading from an IndexInput */ -public class SegmentInputStream extends InputStream { +/*package-private*/ class SegmentInputStream extends InputStream { /** */ private final IndexInput indexInput; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java index 35eaf35bc920..a19e7d4681a5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java @@ -26,7 +26,7 @@ import org.apache.commons.lang3.SerializationUtils; /** Some Utils used in CuVS integration */ -public class Util { +/*package-private*/ class Util { public static ByteArrayOutputStream getZipEntryBAOS( String fileName, SegmentInputStream segInputStream) throws IOException { From feb0e188139872248bef427a4b82ee259799c468 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Wed, 29 Jan 2025 10:12:36 -0500 Subject: [PATCH 57/88] Add knn result consistency test (#14167) --- .../search/BaseKnnVectorQueryTestCase.java | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java index 8a0d3b65aea9..49a35b75f151 100644 --- a/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java +++ b/lucene/core/src/test/org/apache/lucene/search/BaseKnnVectorQueryTestCase.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.util.HashSet; +import java.util.Random; import java.util.Set; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.document.Document; @@ -40,7 +41,9 @@ import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; 
+import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.QueryTimeout; +import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.StoredFields; import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorEncoding; @@ -481,6 +484,62 @@ public void testSkewedIndex() throws IOException { } } + /** Tests with random vectors, number of documents, etc. Uses RandomIndexWriter. */ + public void testRandomConsistencySingleThreaded() throws IOException { + assertRandomConsistency(false); + } + + @AwaitsFix(bugUrl = "https://github.com/apache/lucene/issues/14180") + public void testRandomConsistencyMultiThreaded() throws IOException { + assertRandomConsistency(true); + } + + private void assertRandomConsistency(boolean multiThreaded) throws IOException { + int numDocs = 100; + int dimension = 4; + int numIters = 10; + boolean everyDocHasAVector = random().nextBoolean(); + Random r = random(); + try (Directory d = newDirectoryForTest()) { + // To ensure consistency between seeded runs, remove some randomness + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + iwc.setMergeScheduler(new SerialMergeScheduler()); + iwc.setMergePolicy(NoMergePolicy.INSTANCE); + iwc.setMaxBufferedDocs(numDocs); + iwc.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH); + try (IndexWriter w = new IndexWriter(d, iwc)) { + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (everyDocHasAVector || random().nextInt(10) != 2) { + doc.add(getKnnVectorField("field", randomVector(dimension))); + } + w.addDocument(doc); + if (r.nextBoolean() && i % 50 == 0) { + w.flush(); + } + } + } + try (IndexReader reader = DirectoryReader.open(d)) { + IndexSearcher searcher = newSearcher(reader, true, true, multiThreaded); + // first get the initial set of docs, and we expect all future queries to be exactly the + // same + int k = random().nextInt(80) + 1; + AbstractKnnVectorQuery query = getKnnVectorQuery("field", randomVector(dimension), k); + int n = random().nextInt(100) + 1; + TopDocs expectedResults = searcher.search(query, n); + for (int i = 0; i < numIters; i++) { + TopDocs results = searcher.search(query, n); + assertEquals(expectedResults.totalHits.value(), results.totalHits.value()); + assertEquals(expectedResults.scoreDocs.length, results.scoreDocs.length); + for (int j = 0; j < results.scoreDocs.length; j++) { + assertEquals(expectedResults.scoreDocs[j].doc, results.scoreDocs[j].doc); + assertEquals(expectedResults.scoreDocs[j].score, results.scoreDocs[j].score, EPSILON); + } + } + } + } + } + /** Tests with random vectors, number of documents, etc. Uses RandomIndexWriter. 
*/ public void testRandom() throws IOException { int numDocs = atLeast(100); From de4f07b5a30d0f68592a073abe328d704014cd4b Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Wed, 29 Jan 2025 11:10:23 -0500 Subject: [PATCH 58/88] Adjust knn merge stability testing (#14172) --- .../lucene/tests/index/BaseIndexFileFormatTestCase.java | 3 +++ .../lucene/tests/index/BaseKnnVectorsFormatTestCase.java | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java index 297c1b777f53..c2aa7ff0e4de 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java @@ -275,6 +275,9 @@ public void testMergeStability() throws Exception { new IndexWriterConfig(new MockAnalyzer(random())) .setUseCompoundFile(false) .setMergePolicy(mp); + if (VERBOSE) { + cfg.setInfoStream(System.out); + } IndexWriter w = new IndexWriter(dir, cfg); final int numDocs = atLeast(500); for (int i = 0; i < numDocs; ++i) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index 752f21ea5d7a..97b578e7c5cd 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -124,6 +124,14 @@ protected void addRandomFields(Document doc) { } } + @Override + protected boolean mergeIsStable() { + // suppress this test from base class: merges for knn graphs are not stable due to connected + // components + // logic + return false; + } + private int getVectorsMaxDimensions(String fieldName) { return Codec.getDefault().knnVectorsFormat().getMaxDimensions(fieldName); } From faec0f823817ca95f1f103d6b9482d26ee75cc7b Mon Sep 17 00:00:00 2001 From: Chris Hegarty <62058229+ChrisHegarty@users.noreply.github.com> Date: Thu, 30 Jan 2025 07:48:30 +0000 Subject: [PATCH 59/88] Do not enable security manager on JDK 24+ (#14179) This commit avoids setting the security manager on JDK 24+ - since it is no longer possible to enable it in JDK 24+. This is the minimum required to start testing with JDK 24 EA. --- gradle/testing/randomization.gradle | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gradle/testing/randomization.gradle b/gradle/testing/randomization.gradle index 670f8ef2689e..185cd0872a9c 100644 --- a/gradle/testing/randomization.gradle +++ b/gradle/testing/randomization.gradle @@ -76,7 +76,9 @@ allprojects { [propName: 'tests.asserts', value: "true", description: "Enables or disables assertions mode."], [propName: 'tests.infostream', value: false, description: "Enables or disables infostream logs."], [propName: 'tests.leaveTemporary', value: false, description: "Leave temporary directories after tests complete."], - [propName: 'tests.useSecurityManager', value: true, description: "Control security manager in tests.", buildOnly: true], + [propName: 'tests.useSecurityManager', + value: { -> rootProject.ext.runtimeJavaVersion <= JavaVersion.VERSION_23 ? 
'true' : 'false' }, + description: "Control security manager in tests.", buildOnly: true], // component randomization [propName: 'tests.codec', value: "random", description: "Sets the codec tests should run with."], [propName: 'tests.directory', value: "random", description: "Sets the Directory implementation tests should run with."], From d2c69c1472c7af87d7e619dfcabe207a4a614c20 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Sat, 1 Feb 2025 17:40:22 +0100 Subject: [PATCH 60/88] Remove `maxMergeAtOnce` option from `TieredMergePolicy`. (#14165) `maxMergeAtOnce` increases merge amplification by running multiple merges when it could run a single merge, without giving significant benefits in exchange. We removed this parameter for forced merges in #230. Let's now remove it for regular merges as well. In practice, merges above the floor segment size will be bounded by the number of segments per tier, and merges below the floor segment size will be unbounded. --- lucene/CHANGES.txt | 4 +- lucene/MIGRATE.md | 7 ++++ .../lucene/index/TieredMergePolicy.java | 37 ++----------------- .../lucene/document/TestManyKnnDocs.java | 1 - .../apache/lucene/index/TestAtomicUpdate.java | 2 +- .../index/TestConcurrentMergeScheduler.java | 1 - .../lucene/index/TestForTooMuchCloning.java | 2 +- .../lucene/index/TestForceMergeForever.java | 2 +- .../lucene/index/TestTieredMergePolicy.java | 27 +++++--------- .../replicator/nrt/SimplePrimaryNode.java | 1 - .../lucene/tests/util/LuceneTestCase.java | 10 ----- .../apache/lucene/tests/util/TestUtil.java | 1 - 12 files changed, 26 insertions(+), 69 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index fb9e7665a9e1..5b7c49860da9 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -9,6 +9,8 @@ API Changes --------------------- * GITHUB#11023: Removing deprecated parameters from CheckIndex. (Jakub Slowinski) +* GITHUB#14165: TieredMergePolicy's maxMergeAtOnce parameter was removed. (Adrien Grand) + New Features --------------------- * GITHUB#14097: Binary partitioning merge policy over float-valued vector field. (Mike Sokolov) @@ -16,7 +18,7 @@ New Features Improvements --------------------- -* GITHUB#266: TieredMergePolicy's maxMergeAtOnce default value was changed from 10 to 30. (Adrien Grand) +(No changes) Optimizations --------------------- diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index 1db50b7fdd89..62a30c2444cb 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -17,6 +17,13 @@ # Apache Lucene Migration Guide +## Migration from Lucene 10.x to Lucene 11.0 + +### TieredMergePolicy#setMaxMergeAtOnce removed + +This parameter has no replacement, TieredMergePolicy no longer bounds the +number of segments that may be merged together. + ## Migration from Lucene 9.x to Lucene 10.0 ### DataInput#readVLong() may now read negative vlongs diff --git a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java index 7b56471c8bf4..70036ce9acb6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java @@ -30,9 +30,7 @@ /** * Merges segments of approximately equal size, subject to an allowed number of segments per tier. 
* This is similar to {@link LogByteSizeMergePolicy}, except this merge policy is able to merge - * non-adjacent segment, and separates how many segments are merged at once ({@link - * #setMaxMergeAtOnce}) from how many segments are allowed per tier ({@link #setSegmentsPerTier}). - * This merge policy also does not over-merge (i.e. cascade merges). + * non-adjacent segment. This merge policy also does not over-merge (i.e. cascade merges). * *

    For normal merging, this policy first computes a "budget" of how many segments are allowed to * be in the index. If the index is over-budget, then the policy sorts segments by decreasing size @@ -84,9 +82,6 @@ public class TieredMergePolicy extends MergePolicy { */ public static final double DEFAULT_NO_CFS_RATIO = 0.1; - // User-specified maxMergeAtOnce. In practice we always take the min of its - // value and segsPerTier for segments above the floor size to avoid suboptimal merging. - private int maxMergeAtOnce = 30; private long maxMergedSegmentBytes = 5 * 1024 * 1024 * 1024L; private long floorSegmentBytes = 2 * 1024 * 1024L; @@ -100,36 +95,12 @@ public TieredMergePolicy() { super(DEFAULT_NO_CFS_RATIO, MergePolicy.DEFAULT_MAX_CFS_SEGMENT_SIZE); } - /** - * Maximum number of segments to be merged at a time during "normal" merging. Default is 30. - * - *

    NOTE: Merges above the {@link #setFloorSegmentMB(double) floor segment size} also - * bound the number of merged segments by {@link #setSegmentsPerTier(double) the number of - * segments per tier}. - */ - public TieredMergePolicy setMaxMergeAtOnce(int v) { - if (v < 2) { - throw new IllegalArgumentException("maxMergeAtOnce must be > 1 (got " + v + ")"); - } - maxMergeAtOnce = v; - return this; - } - private enum MERGE_TYPE { NATURAL, FORCE_MERGE, FORCE_MERGE_DELETES } - /** - * Returns the current maxMergeAtOnce setting. - * - * @see #setMaxMergeAtOnce - */ - public int getMaxMergeAtOnce() { - return maxMergeAtOnce; - } - // TODO: should addIndexes do explicit merging, too? And, // if user calls IW.maybeMerge "explicitly" @@ -429,7 +400,7 @@ public MergeSpecification findMerges( } allowedDelCount = Math.max(0, allowedDelCount); - final int mergeFactor = (int) Math.min(maxMergeAtOnce, segsPerTier); + final int mergeFactor = (int) segsPerTier; // Compute max allowed segments for the remainder of the index long levelSize = Math.max(minSegmentBytes, floorSegmentBytes); long bytesLeft = totIndexBytes; @@ -570,7 +541,6 @@ private MergeSpecification doFindMerges( long docCountThisMerge = 0; for (int idx = startIdx; idx < sortedEligible.size() - && candidate.size() < maxMergeAtOnce // We allow merging more than mergeFactor segments together if the merged segment // would be less than the floor segment size. This is important because segments // below the floor segment size are more aggressively merged by this policy, so we @@ -733,7 +703,7 @@ protected MergeScore score( // matter in this case because this merge will not // "cascade" and so it cannot lead to N^2 merge cost // over time: - final int mergeFactor = (int) Math.min(maxMergeAtOnce, segsPerTier); + int mergeFactor = (int) segsPerTier; skew = 1.0 / mergeFactor; } else { skew = @@ -1021,7 +991,6 @@ private long floorSize(long bytes) { @Override public String toString() { StringBuilder sb = new StringBuilder("[" + getClass().getSimpleName() + ": "); - sb.append("maxMergeAtOnce=").append(maxMergeAtOnce).append(", "); sb.append("maxMergedSegmentMB=").append(maxMergedSegmentBytes / 1024. / 1024.).append(", "); sb.append("floorSegmentMB=").append(floorSegmentBytes / 1024. / 1024.).append(", "); sb.append("forceMergeDeletesPctAllowed=").append(forceMergeDeletesPctAllowed).append(", "); diff --git a/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java index 2023ee73391d..7db87e231ff0 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestManyKnnDocs.java @@ -43,7 +43,6 @@ public void testLargeSegment() throws Exception { 128)); // Make sure to use the ConfigurableMCodec instead of a random one iwc.setRAMBufferSizeMB(64); // Use a 64MB buffer to create larger initial segments TieredMergePolicy mp = new TieredMergePolicy(); - mp.setMaxMergeAtOnce(256); // avoid intermediate merges (waste of time with HNSW?) 
mp.setSegmentsPerTier(256); // only merge once at the end when we ask iwc.setMergePolicy(mp); String fieldName = "field"; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAtomicUpdate.java b/lucene/core/src/test/org/apache/lucene/index/TestAtomicUpdate.java index b951a565d623..5dc8327c7997 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAtomicUpdate.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAtomicUpdate.java @@ -109,7 +109,7 @@ public void runTest(Directory directory) throws Exception { IndexWriterConfig conf = new IndexWriterConfig(new MockAnalyzer(random())).setMaxBufferedDocs(7); - ((TieredMergePolicy) conf.getMergePolicy()).setMaxMergeAtOnce(3); + ((TieredMergePolicy) conf.getMergePolicy()).setSegmentsPerTier(3); IndexWriter writer = RandomIndexWriter.mockIndexWriter(directory, conf, random()); // Establish a base index of 100 docs: diff --git a/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java b/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java index fcf42177570e..0d52481b908e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java @@ -375,7 +375,6 @@ protected void doMerge(MergeSource mergeSource, MergePolicy.OneMerge merge) TieredMergePolicy tmp = new TieredMergePolicy(); iwc.setMergePolicy(tmp); - tmp.setMaxMergeAtOnce(2); tmp.setSegmentsPerTier(2); IndexWriter w = new IndexWriter(dir, iwc); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java index d06330c29269..924700835403 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java @@ -36,7 +36,7 @@ public class TestForTooMuchCloning extends LuceneTestCase { public void test() throws Exception { final MockDirectoryWrapper dir = newMockDirectory(); final TieredMergePolicy tmp = new TieredMergePolicy(); - tmp.setMaxMergeAtOnce(2); + tmp.setSegmentsPerTier(2); final RandomIndexWriter w = new RandomIndexWriter( random(), diff --git a/lucene/core/src/test/org/apache/lucene/index/TestForceMergeForever.java b/lucene/core/src/test/org/apache/lucene/index/TestForceMergeForever.java index 78514b2c3ed5..a5cf2b67f488 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestForceMergeForever.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestForceMergeForever.java @@ -70,7 +70,7 @@ public void test() throws Exception { MergePolicy mp = w.getConfig().getMergePolicy(); final int mergeAtOnce = 1 + w.cloneSegmentInfos().size(); if (mp instanceof TieredMergePolicy) { - ((TieredMergePolicy) mp).setMaxMergeAtOnce(mergeAtOnce); + ((TieredMergePolicy) mp).setSegmentsPerTier(mergeAtOnce); } else if (mp instanceof LogMergePolicy) { ((LogMergePolicy) mp).setMergeFactor(mergeAtOnce); } else { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java index 58e33cb7648e..6f11f71f3a99 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java @@ -104,7 +104,7 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws // below we make the assumption that segments that reached the max segment // 
size divided by 2 don't need merging anymore - int mergeFactor = (int) Math.min(tmp.getSegmentsPerTier(), tmp.getMaxMergeAtOnce()); + int mergeFactor = (int) tmp.getSegmentsPerTier(); while (true) { final double segCountLevel = bytesLeft / (double) levelSizeBytes; if (segCountLevel <= tmp.getSegmentsPerTier() @@ -145,12 +145,11 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws assertTrue( String.format( Locale.ROOT, - "mergeFactor=%d minSegmentBytes=%,d maxMergedSegmentBytes=%,d segmentsPerTier=%g maxMergeAtOnce=%d numSegments=%d allowed=%g totalBytes=%,d delPercentage=%g deletesPctAllowed=%g targetNumSegments=%d", + "mergeFactor=%d minSegmentBytes=%,d maxMergedSegmentBytes=%,d segmentsPerTier=%g numSegments=%d allowed=%g totalBytes=%,d delPercentage=%g deletesPctAllowed=%g targetNumSegments=%d", mergeFactor, minSegmentBytes, maxMergedSegmentBytes, tmp.getSegmentsPerTier(), - tmp.getMaxMergeAtOnce(), numSegments, allowedSegCount, totalBytes, @@ -162,10 +161,7 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws @Override protected void assertMerge(MergePolicy policy, MergeSpecification merges) { - TieredMergePolicy tmp = (TieredMergePolicy) policy; - for (OneMerge merge : merges.merges) { - assertTrue(merge.segments.size() <= tmp.getMaxMergeAtOnce()); - } + // anything to assert? } public void testForceMergeDeletes() throws Exception { @@ -174,7 +170,6 @@ public void testForceMergeDeletes() throws Exception { TieredMergePolicy tmp = newTieredMergePolicy(); conf.setMergePolicy(tmp); conf.setMaxBufferedDocs(4); - tmp.setMaxMergeAtOnce(100); tmp.setSegmentsPerTier(100); tmp.setDeletesPctAllowed(50.0); tmp.setForceMergeDeletesPctAllowed(30.0); @@ -219,8 +214,8 @@ public void testPartialMerge() throws Exception { TieredMergePolicy tmp = newTieredMergePolicy(); conf.setMergePolicy(tmp); conf.setMaxBufferedDocs(2); - tmp.setMaxMergeAtOnce(3); tmp.setSegmentsPerTier(6); + tmp.setFloorSegmentMB(Double.MIN_VALUE); IndexWriter w = new IndexWriter(dir, conf); int maxCount = 0; @@ -231,7 +226,7 @@ public void testPartialMerge() throws Exception { w.addDocument(doc); int count = w.getSegmentCount(); maxCount = Math.max(count, maxCount); - assertTrue("count=" + count + " maxCount=" + maxCount, count >= maxCount - 3); + assertTrue("count=" + count + " maxCount=" + maxCount, count >= maxCount - 6); } w.flush(true, true); @@ -973,15 +968,13 @@ public void testMergeSizeIsLessThanFloorSize() throws IOException { assertEquals(15, oneMerge.segments.size()); } - // Segments are below the floor segment size and we'd need to merge more than maxMergeAtOnce - // segments to go above the minimum segment size. We get 1 merge of maxMergeAtOnce=30 segments - // and 1 merge of 50-30=20 segments. + // Segments are below the floor segment size. We get one merge that merges the 50 segments + // together. 
mergePolicy.setFloorSegmentMB(60); mergeSpec = mergePolicy.findMerges(MergeTrigger.FULL_FLUSH, infos, mergeContext); assertNotNull(mergeSpec); - assertEquals(2, mergeSpec.merges.size()); - assertEquals(30, mergeSpec.merges.get(0).segments.size()); - assertEquals(20, mergeSpec.merges.get(1).segments.size()); + assertEquals(1, mergeSpec.merges.size()); + assertEquals(50, mergeSpec.merges.get(0).segments.size()); } public void testFullFlushMerges() throws IOException { @@ -1008,6 +1001,6 @@ public void testFullFlushMerges() throws IOException { segmentInfos = applyMerge(segmentInfos, merge, "_" + segNameGenerator.getAndIncrement(), stats); } - assertEquals(2, segmentInfos.size()); + assertEquals(1, segmentInfos.size()); } } diff --git a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java index 8c1f5fd71e3a..c05f5e028a08 100644 --- a/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java +++ b/lucene/replicator/src/test/org/apache/lucene/replicator/nrt/SimplePrimaryNode.java @@ -164,7 +164,6 @@ private static IndexWriter initWriter( if (mp instanceof TieredMergePolicy) { TieredMergePolicy tmp = (TieredMergePolicy) mp; tmp.setSegmentsPerTier(3); - tmp.setMaxMergeAtOnce(3); } else if (mp instanceof LogMergePolicy) { LogMergePolicy lmp = (LogMergePolicy) mp; lmp.setMergeFactor(3); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java index 84fa120b88b1..ff4eb908e9b0 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java @@ -1096,11 +1096,6 @@ private static void configureRandom(Random r, MergePolicy mergePolicy) { public static TieredMergePolicy newTieredMergePolicy(Random r) { TieredMergePolicy tmp = new TieredMergePolicy(); - if (rarely(r)) { - tmp.setMaxMergeAtOnce(TestUtil.nextInt(r, 2, 9)); - } else { - tmp.setMaxMergeAtOnce(TestUtil.nextInt(r, 10, 50)); - } if (rarely(r)) { tmp.setMaxMergedSegmentMB(0.2 + r.nextDouble() * 2.0); } else { @@ -1235,11 +1230,6 @@ public static void maybeChangeLiveIndexWriterConfig(Random r, LiveIndexWriterCon } } else if (mp instanceof TieredMergePolicy) { TieredMergePolicy tmp = (TieredMergePolicy) mp; - if (rarely(r)) { - tmp.setMaxMergeAtOnce(TestUtil.nextInt(r, 2, 9)); - } else { - tmp.setMaxMergeAtOnce(TestUtil.nextInt(r, 10, 50)); - } if (rarely(r)) { tmp.setMaxMergedSegmentMB(0.2 + r.nextDouble() * 2.0); } else { diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java index 6715edecc166..c2f5d886e3c0 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java @@ -1453,7 +1453,6 @@ public static void reduceOpenFiles(IndexWriter w) { lmp.setMergeFactor(Math.min(5, lmp.getMergeFactor())); } else if (mp instanceof TieredMergePolicy) { TieredMergePolicy tmp = (TieredMergePolicy) mp; - tmp.setMaxMergeAtOnce(Math.min(5, tmp.getMaxMergeAtOnce())); tmp.setSegmentsPerTier(Math.min(5, tmp.getSegmentsPerTier())); } MergeScheduler ms = w.getConfig().getMergeScheduler(); From b429c432b13b905b50781e8aa00a191332c29cbd Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Sat, 1 Feb 
2025 17:41:53 +0100 Subject: [PATCH 61/88] Fix refill logic in nextDoc(). (#14185) The recent optimization from #14164 interfered in a bad way with a prior optimization. --- .../lucene101/Lucene101PostingsReader.java | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java index 04b39c23974b..b73e6316a7dd 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java @@ -888,16 +888,7 @@ private void skipLevel0To(int target) throws IOException { public void advanceShallow(int target) throws IOException { if (target > level0LastDocID) { // advance level 0 skip data doAdvanceShallow(target); - - // If we are on the last doc ID of a block and we are advancing on the doc ID just beyond - // this block, then we decode the block. This may not be necessary, but this helps avoid - // having to check whether we are in a block that is not decoded yet in #nextDoc(). - if (docBufferUpto == BLOCK_SIZE && target == doc + 1) { - refillDocs(); - needsRefilling = false; - } else { - needsRefilling = true; - } + needsRefilling = true; } } @@ -914,8 +905,13 @@ private void doAdvanceShallow(int target) throws IOException { @Override public int nextDoc() throws IOException { - if (doc == level0LastDocID) { - moveToNextLevel0Block(); + if (doc == level0LastDocID || needsRefilling) { + if (needsRefilling) { + refillDocs(); + needsRefilling = false; + } else { + moveToNextLevel0Block(); + } } switch (encoding) { From dd76dc4973fdbf554f0416902ead560da00bdad6 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Sat, 1 Feb 2025 17:43:15 +0100 Subject: [PATCH 62/88] Allow `LogMergePolicy` to merge more than `mergeFactor` segments together when the merge is below the min merge size. (#14166) This is essentially porting #266 to `LogMergePolicy`. By allowing more than `mergeFactor` segments to be merged together for small merges, the merge policy gets a lower write amplification and indexes have fewer small segments. --- lucene/CHANGES.txt | 4 +++ .../apache/lucene/index/LogMergePolicy.java | 25 +++++++++++++++++++ .../lucene/index/TestLogMergePolicy.java | 7 +++++- 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 5b7c49860da9..6eb52ac0b9ba 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -71,6 +71,10 @@ Improvements individual and bulk data retrieval overloads; avoid double buffering with slices. (Chris Hegarty) +* GITHUB#14166: Log(ByteSize|Doc)MergePolicy now allow merging more than + mergeFactor segments together when the merge is below the min merge size. 
+ (Adrien Grand) + Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java index 881ae099d5f8..b6dc9848c9df 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java +++ b/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java @@ -636,6 +636,31 @@ public MergeSpecification findMerges( mergeDocs += segmentDocs; } + if (end - start >= mergeFactor + && minMergeSize < maxMergeSize + && mergeSize < minMergeSize + && anyMerging == false) { + // If the merge has mergeFactor segments but is still smaller than the min merged segment + // size, keep packing candidate segments. + while (end < 1 + upto) { + final SegmentInfoAndLevel segLevel = levels.get(end); + final SegmentCommitInfo info = segLevel.info; + if (mergingSegments.contains(info)) { + anyMerging = true; + break; + } + long segmentSize = size(info, mergeContext); + long segmentDocs = sizeDocs(info, mergeContext); + if (mergeSize + segmentSize > minMergeSize || mergeDocs + segmentDocs > maxMergeDocs) { + break; + } + + mergeSize += segmentSize; + mergeDocs += segmentDocs; + end++; + } + } + if (anyMerging || end - start <= 1) { // skip: there is an ongoing merge at the current level or the computed merge has a single // segment and this merge policy doesn't do singleton merges diff --git a/lucene/core/src/test/org/apache/lucene/index/TestLogMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestLogMergePolicy.java index ea60f9b1e090..8c121c1f6a05 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestLogMergePolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestLogMergePolicy.java @@ -50,8 +50,13 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws @Override protected void assertMerge(MergePolicy policy, MergeSpecification merge) throws IOException { LogMergePolicy lmp = (LogMergePolicy) policy; + MergeContext mockMergeContext = new MockMergeContext(SegmentCommitInfo::getDelCount); for (OneMerge oneMerge : merge.merges) { - assertTrue(oneMerge.segments.size() <= lmp.getMergeFactor()); + long mergeSize = 0; + for (SegmentCommitInfo info : oneMerge.segments) { + mergeSize += lmp.size(info, mockMergeContext); + } + assertTrue(mergeSize < lmp.minMergeSize || oneMerge.segments.size() <= lmp.getMergeFactor()); } } From 834e5607289de9a09d42eb3cbc8df8ca97bf4928 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sun, 2 Feb 2025 15:19:55 +0000 Subject: [PATCH 63/88] minimal update for the new cuvs-java api modifications --- lucene/licenses/cuvs-java-25.02.0.jar.sha1 | 1 + lucene/licenses/cuvs-java-25.02.jar.sha1 | 1 - .../sandbox/vectorsearch/CuVSCodec.java | 4 +-- .../vectorsearch/CuVSVectorsFormat.java | 10 +++---- .../vectorsearch/CuVSVectorsReader.java | 8 ++--- .../vectorsearch/CuVSVectorsWriter.java | 29 ++++++++++--------- versions.lock | 4 +-- versions.toml | 2 +- 8 files changed, 30 insertions(+), 29 deletions(-) create mode 100644 lucene/licenses/cuvs-java-25.02.0.jar.sha1 delete mode 100644 lucene/licenses/cuvs-java-25.02.jar.sha1 diff --git a/lucene/licenses/cuvs-java-25.02.0.jar.sha1 b/lucene/licenses/cuvs-java-25.02.0.jar.sha1 new file mode 100644 index 000000000000..f4abed6a16c0 --- /dev/null +++ b/lucene/licenses/cuvs-java-25.02.0.jar.sha1 @@ -0,0 +1 @@ +bee6c3f5bfdc4a4d21a079f8fc2837c42eb37560 diff --git a/lucene/licenses/cuvs-java-25.02.jar.sha1 b/lucene/licenses/cuvs-java-25.02.jar.sha1 deleted file mode 100644 index 
42b4dae43805..000000000000 --- a/lucene/licenses/cuvs-java-25.02.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -870f2aed1a4633489cc9c3d33128683e668a0f30 diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java index 32ca1077887c..f455a863a9a1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -16,7 +16,7 @@ */ package org.apache.lucene.sandbox.vectorsearch; -import com.nvidia.cuvs.LibraryNotFoundException; +import com.nvidia.cuvs.LibraryException; import java.util.logging.Logger; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; @@ -37,7 +37,7 @@ public CuVSCodec(String name, Codec delegate) { try { format = new CuVSVectorsFormat(1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE); setKnnFormat(format); - } catch (LibraryNotFoundException ex) { + } catch (LibraryException ex) { Logger log = Logger.getLogger(CuVSCodec.class.getName()); log.severe("Couldn't load native library, possible classloader issue. " + ex.getMessage()); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index 96f1c889be5d..dfc224bf6309 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -17,7 +17,7 @@ package org.apache.lucene.sandbox.vectorsearch; import com.nvidia.cuvs.CuVSResources; -import com.nvidia.cuvs.LibraryNotFoundException; +import com.nvidia.cuvs.LibraryException; import java.io.IOException; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.index.SegmentReadState; @@ -44,7 +44,7 @@ public CuVSVectorsFormat() { this.intGraphDegree = 128; this.graphDegree = 64; try { - resources = new CuVSResources(); + resources = CuVSResources.create(); } catch (Throwable e) { throw new RuntimeException(e); } @@ -52,15 +52,15 @@ public CuVSVectorsFormat() { public CuVSVectorsFormat( int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy) - throws LibraryNotFoundException { + throws LibraryException { super("CuVSVectorsFormat"); this.mergeStrategy = mergeStrategy; this.cuvsWriterThreads = cuvsWriterThreads; this.intGraphDegree = intGraphDegree; this.graphDegree = graphDegree; try { - resources = new CuVSResources(); - } catch (LibraryNotFoundException ex) { + resources = CuVSResources.create(); + } catch (LibraryException ex) { throw ex; } catch (Throwable e) { throw new RuntimeException(e); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index f23255792b84..b93d7113036d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -160,7 +160,7 @@ private Map> loadCuVSIndex(ZipInputStream zis, boolean i { cagraIndexes.put( segmentField, - new CagraIndex.Builder(resources) + CagraIndex.newBuilder(resources) .from(new ByteArrayInputStream(baos.toByteArray())) .build()); break; @@ -169,17 +169,17 @@ private Map> loadCuVSIndex(ZipInputStream zis, boolean i { 
bruteforceIndexes.put( segmentField, - new BruteForceIndex.Builder(resources) + BruteForceIndex.newBuilder(resources) .from(new ByteArrayInputStream(baos.toByteArray())) .build()); break; } case "hnsw": { - HnswIndexParams indexParams = new HnswIndexParams.Builder(resources).build(); + HnswIndexParams indexParams = new HnswIndexParams.Builder().build(); hnswIndexes.put( segmentField, - new HnswIndex.Builder(resources) + HnswIndex.newBuilder(resources) .from(new ByteArrayInputStream(baos.toByteArray())) .withIndexParams(indexParams) .build()); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index c652f5333a74..9de52248004f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -23,9 +23,10 @@ import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo; import com.nvidia.cuvs.CuVSResources; import java.io.ByteArrayOutputStream; -import java.io.File; import java.io.IOException; import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; @@ -55,11 +56,11 @@ private CagraIndex cagraIndex; private CagraIndex cagraIndexForHnsw; - private int cuvsWriterThreads; - private int intGraphDegree; - private int graphDegree; - private MergeStrategy mergeStrategy; - private CuVSResources resources; + private final int cuvsWriterThreads; + private final int intGraphDegree; + private final int graphDegree; + private final MergeStrategy mergeStrategy; + private final CuVSResources resources; /** Merge strategy used for CuVS */ public enum MergeStrategy { @@ -113,7 +114,7 @@ public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException @SuppressForbidden(reason = "A temporary java.util.File is needed for Cagra's serialization") private byte[] createCagraIndex(float[][] vectors, List mapping) throws Throwable { CagraIndexParams indexParams = - new CagraIndexParams.Builder(resources) + new CagraIndexParams.Builder() .withNumWriterThreads(cuvsWriterThreads) .withIntermediateGraphDegree(intGraphDegree) .withGraphDegree(graphDegree) @@ -122,13 +123,13 @@ private byte[] createCagraIndex(float[][] vectors, List mapping) throws // log.info("Indexing started: " + System.currentTimeMillis()); cagraIndex = - new CagraIndex.Builder(resources).withDataset(vectors).withIndexParams(indexParams).build(); + CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build(); // log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + // vectors.length); ByteArrayOutputStream baos = new ByteArrayOutputStream(); - File tmpFile = - File.createTempFile( + Path tmpFile = + Files.createTempFile( "tmpindex", "cag"); // TODO: Should we make this a file with random names? 
cagraIndex.serialize(baos, tmpFile); return baos.toByteArray(); @@ -143,7 +144,7 @@ private byte[] createBruteForceIndex(float[][] vectors) throws Throwable { // log.info("Indexing started: " + System.currentTimeMillis()); BruteForceIndex index = - new BruteForceIndex.Builder(resources) + BruteForceIndex.newBuilder(resources) .withIndexParams(indexParams) .withDataset(vectors) .build(); @@ -157,7 +158,7 @@ private byte[] createBruteForceIndex(float[][] vectors) throws Throwable { @SuppressForbidden(reason = "A temporary java.util.File is needed for HNSW's serialization") private byte[] createHnswIndex(float[][] vectors) throws Throwable { CagraIndexParams indexParams = - new CagraIndexParams.Builder(resources) + new CagraIndexParams.Builder() .withNumWriterThreads(cuvsWriterThreads) .withIntermediateGraphDegree(intGraphDegree) .withGraphDegree(graphDegree) @@ -166,12 +167,12 @@ private byte[] createHnswIndex(float[][] vectors) throws Throwable { // log.info("Indexing started: " + System.currentTimeMillis()); cagraIndexForHnsw = - new CagraIndex.Builder(resources).withDataset(vectors).withIndexParams(indexParams).build(); + CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build(); // log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + // vectors.length); ByteArrayOutputStream baos = new ByteArrayOutputStream(); - File tmpFile = File.createTempFile("tmpindex", "hnsw"); + Path tmpFile = Files.createTempFile("tmpindex", "hnsw"); cagraIndexForHnsw.serializeToHNSW(baos, tmpFile); return baos.toByteArray(); } diff --git a/versions.lock b/versions.lock index dfa465a1b3fe..b9d5fa0a17a1 100644 --- a/versions.lock +++ b/versions.lock @@ -4,7 +4,7 @@ "main_dependencies" : { "com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.8.1" : "fa9ef26b,refs=4", "com.ibm.icu:icu4j:74.2" : "47ea4550,refs=6", - "com.nvidia.cuvs:cuvs-java:25.02" : "0129b4f0,refs=6", + "com.nvidia.cuvs:cuvs-java:25.02.0" : "0129b4f0,refs=6", "commons-codec:commons-codec:1.13" : "e9962aab,refs=4", "io.sgr:s2-geometry-library-java:1.0.0" : "cbc357ab,refs=4", "junit:junit:4.13.1" : "fa9ef26b,refs=4", @@ -48,7 +48,7 @@ "com.google.j2objc:j2objc-annotations:1.3" : "6897bc09,refs=38", "com.google.protobuf:protobuf-java:3.19.2" : "6897bc09,refs=38", "com.ibm.icu:icu4j:74.2" : "ffa00415,refs=8", - "com.nvidia.cuvs:cuvs-java:25.02" : "7ac6f8d9,refs=9", + "com.nvidia.cuvs:cuvs-java:25.02.0" : "7ac6f8d9,refs=9", "commons-codec:commons-codec:1.13" : "733734f0,refs=6", "io.github.java-diff-utils:java-diff-utils:4.0" : "6897bc09,refs=38", "io.sgr:s2-geometry-library-java:1.0.0" : "1d5a4b2b,refs=4", diff --git a/versions.toml b/versions.toml index d0db5fd20d9d..06c2247422a4 100644 --- a/versions.toml +++ b/versions.toml @@ -5,7 +5,7 @@ assertj = "3.21.0" commons-codec = "1.13" commons-compress = "1.19" commons-lang3 = "3.17.0" -cuvs = "25.02" +cuvs = "25.02.0" ecj = "3.36.0" errorprone = "2.18.0" flexmark = "0.61.24" From 3772c4c46c7996fd00291b9afc997bdd244da1a0 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sun, 2 Feb 2025 15:34:56 +0000 Subject: [PATCH 64/88] add filter cuvs service provider --- lucene/licenses/cuvs-java-25.02.0.jar.sha1 | 2 +- lucene/sandbox/src/java/module-info.java | 2 + .../vectorsearch/FilterCuVSProvider.java | 43 +++++++++++++++++++ .../FilterCuVSServiceProvider.java | 11 +++++ .../com.nvidia.cuvs.spi.CuVSServiceProvider | 16 +++++++ 5 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 
lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java create mode 100644 lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider diff --git a/lucene/licenses/cuvs-java-25.02.0.jar.sha1 b/lucene/licenses/cuvs-java-25.02.0.jar.sha1 index f4abed6a16c0..ccb02e86aa8c 100644 --- a/lucene/licenses/cuvs-java-25.02.0.jar.sha1 +++ b/lucene/licenses/cuvs-java-25.02.0.jar.sha1 @@ -1 +1 @@ -bee6c3f5bfdc4a4d21a079f8fc2837c42eb37560 +0086126edbd145e5d0be65e6157e96e3e8a2ebca diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 051c1df0a257..8b182a6e050c 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -45,4 +45,6 @@ // org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat; provides org.apache.lucene.codecs.Codec with org.apache.lucene.sandbox.vectorsearch.CuVSCodec; + provides com.nvidia.cuvs.spi.CuVSServiceProvider with + org.apache.lucene.sandbox.vectorsearch.FilterCuVSServiceProvider; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java new file mode 100644 index 000000000000..641ef40acb3b --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java @@ -0,0 +1,43 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CuVSResources; +import com.nvidia.cuvs.HnswIndex; +import com.nvidia.cuvs.spi.CuVSProvider; + +import java.nio.file.Path; + +public class FilterCuVSProvider implements CuVSProvider { + + private final CuVSProvider delegate; + + FilterCuVSProvider(CuVSProvider delegate) { + this.delegate = delegate; + } + + @Override + public Path nativeLibraryPath() { + return CuVSProvider.TMPDIR; + } + + @Override + public CuVSResources newCuVSResources(Path tempPath) throws Throwable { + return delegate.newCuVSResources(tempPath); + } + + @Override + public BruteForceIndex.Builder newBruteForceIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException { + return delegate.newBruteForceIndexBuilder(cuVSResources); + } + + @Override + public CagraIndex.Builder newCagraIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException { + return delegate.newCagraIndexBuilder(cuVSResources); + } + + @Override + public HnswIndex.Builder newHnswIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException { + return delegate.newHnswIndexBuilder(cuVSResources); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java new file mode 100644 index 000000000000..7840b07a86cc --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java @@ -0,0 +1,11 @@ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.spi.CuVSProvider; +import com.nvidia.cuvs.spi.CuVSServiceProvider; + +public class FilterCuVSServiceProvider extends CuVSServiceProvider { + @Override + public CuVSProvider get(CuVSProvider builtinProvider) { + return new FilterCuVSProvider(builtinProvider); + } +} diff --git 
a/lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider b/lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider new file mode 100644 index 000000000000..5e7ceba19343 --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.sandbox.vectorsearch.FilterCuVSServiceProvider \ No newline at end of file From b13d37f0f4d5332a1a15d9dd3fae074e45ea8f94 Mon Sep 17 00:00:00 2001 From: Amir Raza <62626184+pseudo-nymous@users.noreply.github.com> Date: Sun, 2 Feb 2025 23:41:39 +0530 Subject: [PATCH 65/88] Use github wf to add module labels for PR based on file changes (#14101) --- .github/labeler.yml | 130 +++++++++++++++++++++++ .github/workflows/label-pull-request.yml | 23 ++++ 2 files changed, 153 insertions(+) create mode 100644 .github/labeler.yml create mode 100644 .github/workflows/label-pull-request.yml diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 000000000000..97c040337b80 --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,130 @@ +# This file defines module label mappings for the Lucene project. +# Each module is associated with a set of file globs that, when matched, +# will trigger the corresponding label to be applied to pull requests. +# +# This configuration is used by the workflow defined in .github/workflows/label-pull-request.yml. +# If we are adding new labels or refactoring modules, we will need to modify this file globs here to ensure that the correct labels are applied. 
+ +# For more information on how to define globs, visit: https://github.com/actions/labeler + +module:analysis: + - changed-files: + - any-glob-to-any-file: 'lucene/analysis/**' + +module:benchmark: + - changed-files: + - any-glob-to-any-file: 'lucene/benchmark/**' + +module:classification: + - changed-files: + - any-glob-to-any-file: 'lucene/classification/**' + +module:core/codecs: + - changed-files: + - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/codecs/**', 'lucene/core/src/test/org/apache/lucene/codecs/**'] + +module:core/FSTs: + - changed-files: + - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/util/fst/**', 'lucene/core/src/test/org/apache/lucene/util/fst/**'] + +module:core/hnsw: + - changed-files: + - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/util/hnsw/**', 'lucene/core/src/test/org/apache/lucene/util/hnsw/**'] + +module:core/index: + - changed-files: + - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/index/**', 'lucene/core/src/test/org/apache/lucene/index/**'] + +module:core/search: + - changed-files: + - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/search/**', 'lucene/core/src/test/org/apache/lucene/search/**'] + +module:core/store: + - changed-files: + - any-glob-to-any-file: ['lucene/core/src/java/org/apache/lucene/store/**', 'lucene/core/src/test/org/apache/lucene/store/**'] + +module:core/other: + - all: + - changed-files: + - any-glob-to-any-file: ['lucene/core/**'] + - all-globs-to-all-files: + - '!lucene/core/src/java/org/apache/lucene/codecs/**' + - '!lucene/core/src/test/org/apache/lucene/codecs/**' + - '!lucene/core/src/java/org/apache/lucene/util/fst/**' + - '!lucene/core/src/test/org/apache/lucene/util/fst/**' + - '!lucene/core/src/java/org/apache/lucene/util/hnsw/**' + - '!lucene/core/src/test/org/apache/lucene/util/hnsw/**' + - '!lucene/core/src/java/org/apache/lucene/index/**' + - '!lucene/core/src/test/org/apache/lucene/index/**' + - '!lucene/core/src/java/org/apache/lucene/search/**' + - '!lucene/core/src/test/org/apache/lucene/search/**' + - '!lucene/core/src/java/org/apache/lucene/store/**' + - '!lucene/core/src/test/org/apache/lucene/store/**' + +module:demo: + - changed-files: + - any-glob-to-any-file: 'lucene/demo/**' + +module:expressions: + - changed-files: + - any-glob-to-any-file: 'lucene/expressions/**' + +module:facet: + - changed-files: + - any-glob-to-any-file: 'lucene/facet/**' + +module:grouping: + - changed-files: + - any-glob-to-any-file: 'lucene/grouping/**' + +module:highlighter: + - changed-files: + - any-glob-to-any-file: 'lucene/highlighter/**' + +module:join: + - changed-files: + - any-glob-to-any-file: 'lucene/join/**' + +module:luke: + - changed-files: + - any-glob-to-any-file: 'lucene/luke/**' + +module:misc: + - changed-files: + - any-glob-to-any-file: 'lucene/misc/**' + +module:monitor: + - changed-files: + - any-glob-to-any-file: 'lucene/monitor/**' + +module:queries: + - changed-files: + - any-glob-to-any-file: 'lucene/queries/**' + +module:queryparser: + - changed-files: + - any-glob-to-any-file: 'lucene/queryparser/**' + +module:replicator: + - changed-files: + - any-glob-to-any-file: 'lucene/replicator/**' + +module:sandbox: + - changed-files: + - any-glob-to-any-file: 'lucene/sandbox/**' + +module:spatial: + - changed-files: + - any-glob-to-any-file: ['lucene/spatial-extras/**', 'lucene/spatial-test-fixtures/**'] + +module:spatial3d: + - changed-files: + - any-glob-to-any-file: 'lucene/spatial3d/**' + +module:suggest: + - changed-files: + 
- any-glob-to-any-file: 'lucene/suggest/**' + +module:test-framework: + - changed-files: + - any-glob-to-any-file: 'lucene/test-framework/**' diff --git a/.github/workflows/label-pull-request.yml b/.github/workflows/label-pull-request.yml new file mode 100644 index 000000000000..19932d51c04c --- /dev/null +++ b/.github/workflows/label-pull-request.yml @@ -0,0 +1,23 @@ +# This file defines the workflow for labeling pull requests with module tags based on the changed files in the PR. +# It uses the `actions/labeler` GitHub Action to achieve the same. +# +# The workflow is triggered on the `pull_request_target` event which ensures workflow is only run from the master branch. +# The job `labeler` runs on `ubuntu-latest` and has permissions to read contents and write pull requests. +# +# For more information on the `actions/labeler` GitHub Action, refer to https://github.com/actions/labeler + +name: "Pull Request Labeler" +run-name: Labelling pull request with module tags based on changed files in the PR +on: + - pull_request_target + +jobs: + labeler: + permissions: + contents: read + pull-requests: write + runs-on: ubuntu-latest + steps: + - uses: actions/labeler@v5 + with: + sync-labels: true \ No newline at end of file From 8453bb1832ca09daf1e17c63194e2ce71ef2f5f4 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sun, 2 Feb 2025 19:05:42 +0000 Subject: [PATCH 66/88] cleanup --- .../vectorsearch/FilterCuVSProvider.java | 80 ++++++++++++------- .../FilterCuVSServiceProvider.java | 16 ++++ 2 files changed, 65 insertions(+), 31 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java index 641ef40acb3b..155d9301ab36 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.vectorsearch; import com.nvidia.cuvs.BruteForceIndex; @@ -5,39 +21,41 @@ import com.nvidia.cuvs.CuVSResources; import com.nvidia.cuvs.HnswIndex; import com.nvidia.cuvs.spi.CuVSProvider; - import java.nio.file.Path; public class FilterCuVSProvider implements CuVSProvider { - private final CuVSProvider delegate; - - FilterCuVSProvider(CuVSProvider delegate) { - this.delegate = delegate; - } - - @Override - public Path nativeLibraryPath() { - return CuVSProvider.TMPDIR; - } - - @Override - public CuVSResources newCuVSResources(Path tempPath) throws Throwable { - return delegate.newCuVSResources(tempPath); - } - - @Override - public BruteForceIndex.Builder newBruteForceIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException { - return delegate.newBruteForceIndexBuilder(cuVSResources); - } - - @Override - public CagraIndex.Builder newCagraIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException { - return delegate.newCagraIndexBuilder(cuVSResources); - } - - @Override - public HnswIndex.Builder newHnswIndexBuilder(CuVSResources cuVSResources) throws UnsupportedOperationException { - return delegate.newHnswIndexBuilder(cuVSResources); - } + private final CuVSProvider delegate; + + FilterCuVSProvider(CuVSProvider delegate) { + this.delegate = delegate; + } + + @Override + public Path nativeLibraryPath() { + return CuVSProvider.TMPDIR; + } + + @Override + public CuVSResources newCuVSResources(Path tempPath) throws Throwable { + return delegate.newCuVSResources(tempPath); + } + + @Override + public BruteForceIndex.Builder newBruteForceIndexBuilder(CuVSResources cuVSResources) + throws UnsupportedOperationException { + return delegate.newBruteForceIndexBuilder(cuVSResources); + } + + @Override + public CagraIndex.Builder newCagraIndexBuilder(CuVSResources cuVSResources) + throws UnsupportedOperationException { + return delegate.newCagraIndexBuilder(cuVSResources); + } + + @Override + public HnswIndex.Builder newHnswIndexBuilder(CuVSResources cuVSResources) + throws UnsupportedOperationException { + return delegate.newHnswIndexBuilder(cuVSResources); + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java index 7840b07a86cc..65dbf5d14737 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.lucene.sandbox.vectorsearch; import com.nvidia.cuvs.spi.CuVSProvider; From 2bce954d07477c8871c62c27958ec7c360c9f07d Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Mon, 3 Feb 2025 17:03:09 +0000 Subject: [PATCH 67/88] itr : remove dep on commons lang3, fix visibility issues --- gradle/testing/defaults-tests.gradle | 2 +- lucene/sandbox/src/java/module-info.java | 9 ++- .../vectorsearch/CagraFieldVectorsWriter.java | 5 +- .../vectorsearch/CuVSVectorsFormat.java | 2 +- .../vectorsearch/CuVSVectorsReader.java | 7 +- .../vectorsearch/CuVSVectorsWriter.java | 1 - .../vectorsearch/FilterCuVSProvider.java | 2 +- .../FilterCuVSServiceProvider.java | 1 + .../vectorsearch/SerializationUtils.java | 64 +++++++++++++++++++ .../lucene/sandbox/vectorsearch/Util.java | 1 - .../services/org.apache.lucene.codecs.Codec | 2 +- 11 files changed, 79 insertions(+), 17 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle index be0004b72378..b636162ea96d 100644 --- a/gradle/testing/defaults-tests.gradle +++ b/gradle/testing/defaults-tests.gradle @@ -143,7 +143,7 @@ allprojects { ':lucene:codecs', ":lucene:distribution.tests", ":lucene:test-framework" - ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core,com.nvidia.cuvs') + ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core,com.nvidia.cuvs') // TODO: make this sandbox only def loggingConfigFile = layout.projectDirectory.file("${resources}/logging.properties") def tempDir = layout.projectDirectory.dir(testsTmpDir.toString()) diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 8b182a6e050c..822c06e1e431 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -22,7 +22,6 @@ requires org.apache.lucene.facet; requires java.logging; requires com.nvidia.cuvs; - requires org.apache.commons.lang3; exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; @@ -41,10 +40,10 @@ provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; - // provides org.apache.lucene.codecs.KnnVectorsFormat with - // org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat; - provides org.apache.lucene.codecs.Codec with - org.apache.lucene.sandbox.vectorsearch.CuVSCodec; + provides org.apache.lucene.codecs.KnnVectorsFormat with + org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat; +// provides org.apache.lucene.codecs.Codec with +// org.apache.lucene.sandbox.vectorsearch.CuVSCodec; provides com.nvidia.cuvs.spi.CuVSServiceProvider with org.apache.lucene.sandbox.vectorsearch.FilterCuVSServiceProvider; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java index e712d69c1ef1..66718be698be 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.concurrent.ConcurrentHashMap; import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.index.FieldInfo; @@ -37,9 +38,9 @@ public 
CagraFieldVectorsWriter(FieldInfo fieldInfo) { @Override public long ramBytesUsed() { - return fieldName.getBytes(Charset.forName("UTF-8")).length + return fieldName.getBytes(StandardCharsets.UTF_8).length + Integer.BYTES - + (vectors.size() * fieldVectorDimension * Float.BYTES); + + ((long) vectors.size() * fieldVectorDimension * Float.BYTES); } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index dfc224bf6309..525dfe0eeb00 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -25,7 +25,7 @@ import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; /** CuVS based KnnVectorsFormat for GPU acceleration */ -/*package-private*/ class CuVSVectorsFormat extends KnnVectorsFormat { +public class CuVSVectorsFormat extends KnnVectorsFormat { public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; public static final String VECTOR_DATA_EXTENSION = "cag"; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index b93d7113036d..4b41ef7f3bb4 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -37,7 +37,6 @@ import java.util.stream.Stream; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -import org.apache.commons.lang3.SerializationUtils; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.index.ByteVectorValues; @@ -141,18 +140,18 @@ private Map> loadCuVSIndex(ZipInputStream zis, boolean i switch (extension) { case "meta": { - maxDocs = (Map) SerializationUtils.deserialize(baos.toByteArray()); + maxDocs = SerializationUtils.deserialize(baos.toByteArray()); break; } case "vec": { vectors.put( - segmentField, (List) SerializationUtils.deserialize(baos.toByteArray())); + segmentField, SerializationUtils.deserialize(baos.toByteArray())); break; } case "map": { - List map = (List) SerializationUtils.deserialize(baos.toByteArray()); + List map = SerializationUtils.deserialize(baos.toByteArray()); mappings.put(segmentField, map); break; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 9de52248004f..3400d306cd49 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -30,7 +30,6 @@ import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; -import org.apache.commons.lang3.SerializationUtils; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.codecs.KnnVectorsWriter; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java index 155d9301ab36..842fdde65dd2 100644 --- 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java @@ -23,7 +23,7 @@ import com.nvidia.cuvs.spi.CuVSProvider; import java.nio.file.Path; -public class FilterCuVSProvider implements CuVSProvider { +/*package-private*/ class FilterCuVSProvider implements CuVSProvider { private final CuVSProvider delegate; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java index 65dbf5d14737..eeb7b6895aa3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java @@ -19,6 +19,7 @@ import com.nvidia.cuvs.spi.CuVSProvider; import com.nvidia.cuvs.spi.CuVSServiceProvider; +/** A provider that creates instances of FilterCuVSProvider. */ public class FilterCuVSServiceProvider extends CuVSServiceProvider { @Override public CuVSProvider get(CuVSProvider builtinProvider) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java new file mode 100644 index 000000000000..5eaf12d83a24 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.OutputStream; +import java.io.Serializable; +import java.io.UncheckedIOException; +import java.util.Objects; + +/*package-private*/ class SerializationUtils { + + static byte[] serialize(final Serializable obj) { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(64 * 1024); + serialize(obj, baos); + return baos.toByteArray(); + } + + static void serialize(final Serializable obj, final OutputStream outputStream) { + Objects.requireNonNull(outputStream); + try (ObjectOutputStream out = new ObjectOutputStream(outputStream)) { + out.writeObject(obj); + } catch (final IOException ex) { + throw new UncheckedIOException(ex); + } + } + + static T deserialize(final byte[] objectData) { + Objects.requireNonNull(objectData); + return deserialize(new ByteArrayInputStream(objectData)); + } + + static T deserialize(final InputStream inputStream) { + Objects.requireNonNull(inputStream); + try (ObjectInputStream in = new ObjectInputStream(inputStream)) { + @SuppressWarnings("unchecked") + final T obj = (T) in.readObject(); + return obj; + } catch (IOException ex) { + throw new UncheckedIOException(ex); + } catch (ClassNotFoundException ex) { + throw new AssertionError(ex); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java index a19e7d4681a5..ba980777b2df 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java @@ -23,7 +23,6 @@ import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -import org.apache.commons.lang3.SerializationUtils; /** Some Utils used in CuVS integration */ /*package-private*/ class Util { diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 6f0a89e365d1..d039758f2603 100644 --- a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-org.apache.lucene.sandbox.vectorsearch.CuVSCodec \ No newline at end of file +#org.apache.lucene.sandbox.vectorsearch.CuVSCodec \ No newline at end of file From e62112ec6235afec2a6f71aff5cd73a25be0b1c8 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 5 Feb 2025 10:00:12 +0000 Subject: [PATCH 68/88] tidy --- .../vectorsearch/SerializationUtils.java | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java index 5eaf12d83a24..a46db32afea9 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java @@ -29,36 +29,36 @@ /*package-private*/ class SerializationUtils { - static byte[] serialize(final Serializable obj) { - final ByteArrayOutputStream baos = new ByteArrayOutputStream(64 * 1024); - serialize(obj, baos); - return baos.toByteArray(); - } + static byte[] serialize(final Serializable obj) { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(64 * 1024); + serialize(obj, baos); + return baos.toByteArray(); + } - static void serialize(final Serializable obj, final OutputStream outputStream) { - Objects.requireNonNull(outputStream); - try (ObjectOutputStream out = new ObjectOutputStream(outputStream)) { - out.writeObject(obj); - } catch (final IOException ex) { - throw new UncheckedIOException(ex); - } + static void serialize(final Serializable obj, final OutputStream outputStream) { + Objects.requireNonNull(outputStream); + try (ObjectOutputStream out = new ObjectOutputStream(outputStream)) { + out.writeObject(obj); + } catch (final IOException ex) { + throw new UncheckedIOException(ex); } + } - static T deserialize(final byte[] objectData) { - Objects.requireNonNull(objectData); - return deserialize(new ByteArrayInputStream(objectData)); - } + static T deserialize(final byte[] objectData) { + Objects.requireNonNull(objectData); + return deserialize(new ByteArrayInputStream(objectData)); + } - static T deserialize(final InputStream inputStream) { - Objects.requireNonNull(inputStream); - try (ObjectInputStream in = new ObjectInputStream(inputStream)) { - @SuppressWarnings("unchecked") - final T obj = (T) in.readObject(); - return obj; - } catch (IOException ex) { - throw new UncheckedIOException(ex); - } catch (ClassNotFoundException ex) { - throw new AssertionError(ex); - } + static T deserialize(final InputStream inputStream) { + Objects.requireNonNull(inputStream); + try (ObjectInputStream in = new ObjectInputStream(inputStream)) { + @SuppressWarnings("unchecked") + final T obj = (T) in.readObject(); + return obj; + } catch (IOException ex) { + throw new UncheckedIOException(ex); + } catch (ClassNotFoundException ex) { + throw new AssertionError(ex); } + } } From 349c7aa30aa047c56809e57e95243f371a91a2e8 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 5 Feb 2025 10:02:13 +0000 Subject: [PATCH 69/88] expose knn format and update test --- lucene/sandbox/src/java/module-info.java | 4 +--- .../vectorsearch/CagraFieldVectorsWriter.java | 1 - .../vectorsearch/CuVSVectorsFormat.java | 18 +++++++++--------- .../vectorsearch/CuVSVectorsReader.java | 3 +-- .../services/org.apache.lucene.codecs.Codec | 16 ---------------- .../lucene/sandbox/vectorsearch/TestCuVS.java | 6 +++--- 6 files changed, 14 insertions(+), 34 deletions(-) delete mode 
100644 lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index 822c06e1e431..59e89cfd0bf0 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -40,10 +40,8 @@ provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; - provides org.apache.lucene.codecs.KnnVectorsFormat with + provides org.apache.lucene.codecs.KnnVectorsFormat with org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat; -// provides org.apache.lucene.codecs.Codec with -// org.apache.lucene.sandbox.vectorsearch.CuVSCodec; provides com.nvidia.cuvs.spi.CuVSServiceProvider with org.apache.lucene.sandbox.vectorsearch.FilterCuVSServiceProvider; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java index 66718be698be..183b3c87d431 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -17,7 +17,6 @@ package org.apache.lucene.sandbox.vectorsearch; import java.io.IOException; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.concurrent.ConcurrentHashMap; import org.apache.lucene.codecs.KnnFieldVectorsWriter; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index 525dfe0eeb00..ef8e206fcd48 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -31,6 +31,10 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { public static final String VECTOR_DATA_EXTENSION = "cag"; public static final String META_EXTENSION = "cagmf"; public static final int VERSION_CURRENT = 0; + public static final int DEFAULT_WRITER_THREADS = 1; + public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; + public static final int DEFAULT_GRAPH_DEGREE = 64; + public final int maxDimensions = 4096; public final int cuvsWriterThreads; public final int intGraphDegree; @@ -39,15 +43,11 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { public static CuVSResources resources; public CuVSVectorsFormat() { - super("CuVSVectorsFormat"); - this.cuvsWriterThreads = 1; - this.intGraphDegree = 128; - this.graphDegree = 64; - try { - resources = CuVSResources.create(); - } catch (Throwable e) { - throw new RuntimeException(e); - } + this( + DEFAULT_WRITER_THREADS, + DEFAULT_INTERMEDIATE_GRAPH_DEGREE, + DEFAULT_GRAPH_DEGREE, + MergeStrategy.NON_TRIVIAL_MERGE); } public CuVSVectorsFormat( diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index 4b41ef7f3bb4..0afbe18b278e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -145,8 +145,7 @@ private Map> loadCuVSIndex(ZipInputStream zis, boolean i } case "vec": { - vectors.put( - segmentField, 
SerializationUtils.deserialize(baos.toByteArray())); + vectors.put(segmentField, SerializationUtils.deserialize(baos.toByteArray())); break; } case "map": diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec deleted file mode 100644 index d039758f2603..000000000000 --- a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#org.apache.lucene.sandbox.vectorsearch.CuVSCodec \ No newline at end of file diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java index 70325a3aa294..dd013eed547a 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java @@ -66,7 +66,7 @@ public class TestCuVS extends LuceneTestCase { public static void beforeClass() throws Exception { directory = newDirectory(); - Codec codec = new CuVSCodec(); + Codec codec = TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); RandomIndexWriter writer = new RandomIndexWriter( @@ -105,8 +105,8 @@ public static void beforeClass() throws Exception { @AfterClass public static void afterClass() throws Exception { - reader.close(); - directory.close(); + if (reader != null) reader.close(); + if (directory != null) directory.close(); searcher = null; reader = null; directory = null; From 8d8db0b87c1ed02bcfc2f9aed9579c8eb7ac90b8 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 5 Feb 2025 11:52:15 +0000 Subject: [PATCH 70/88] fix initialization of cuvSResources --- .../vectorsearch/CuVSVectorsFormat.java | 57 +++++++++++++++---- .../lucene/sandbox/vectorsearch/TestCuVS.java | 20 +++---- 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index ef8e206fcd48..1a20913f312b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -19,6 +19,7 @@ import com.nvidia.cuvs.CuVSResources; import com.nvidia.cuvs.LibraryException; import java.io.IOException; +import java.util.logging.Logger; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -27,6 +28,8 @@ /** CuVS based KnnVectorsFormat for GPU acceleration */ public class CuVSVectorsFormat extends 
KnnVectorsFormat { + private static final Logger LOG = Logger.getLogger(CuVSVectorsFormat.class.getName()); + public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; public static final String VECTOR_DATA_EXTENSION = "cag"; public static final String META_EXTENSION = "cagmf"; @@ -35,12 +38,13 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; public static final int DEFAULT_GRAPH_DEGREE = 64; - public final int maxDimensions = 4096; - public final int cuvsWriterThreads; - public final int intGraphDegree; - public final int graphDegree; - public MergeStrategy mergeStrategy; - public static CuVSResources resources; + static CuVSResources resources = cuVSResourcesOrNull(); + + final int maxDimensions = 4096; + final int cuvsWriterThreads; + final int intGraphDegree; + final int graphDegree; + final MergeStrategy mergeStrategy; public CuVSVectorsFormat() { this( @@ -58,23 +62,44 @@ public CuVSVectorsFormat( this.cuvsWriterThreads = cuvsWriterThreads; this.intGraphDegree = intGraphDegree; this.graphDegree = graphDegree; + } + + private static CuVSResources cuVSResourcesOrNull() { try { resources = CuVSResources.create(); - } catch (LibraryException ex) { - throw ex; - } catch (Throwable e) { - throw new RuntimeException(e); + return resources; + } catch (UnsupportedOperationException uoe) { + LOG.warning("cuvs is not supported on this platform or java version"); + } catch (Throwable t) { + if (t instanceof ExceptionInInitializerError ex) { + t = ex.getCause(); + } + LOG.warning("Exception occurred during creation of cuvs resources. " + t); + } + return null; + } + + /** Tells whether the platform supports cuvs. */ + public static boolean supported() { + return resources != null; + } + + private static void checkSupported() { + if (!supported()) { + throw new UnsupportedOperationException(); } } @Override public CuVSVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { + checkSupported(); return new CuVSVectorsWriter( state, cuvsWriterThreads, intGraphDegree, graphDegree, mergeStrategy, resources); } @Override public CuVSVectorsReader fieldsReader(SegmentReadState state) throws IOException { + checkSupported(); try { return new CuVSVectorsReader(state, resources); } catch (Throwable e) { @@ -86,4 +111,16 @@ public CuVSVectorsReader fieldsReader(SegmentReadState state) throws IOException public int getMaxDimensions(String fieldName) { return maxDimensions; } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("CuVSVectorsFormat("); + sb.append("cuvsWriterThreads=").append(cuvsWriterThreads); + sb.append("intGraphDegree=").append(intGraphDegree); + sb.append("graphDegree=").append(graphDegree); + sb.append("mergeStrategy=").append(mergeStrategy); + sb.append("resources=").append(resources); + sb.append(")"); + return sb.toString(); + } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java index dd013eed547a..57be29050441 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java @@ -51,23 +51,23 @@ public class TestCuVS extends LuceneTestCase { protected static Logger log = Logger.getLogger(TestCuVS.class.getName()); - private static IndexSearcher searcher; - private static IndexReader reader; - private static Directory 
directory; + static final Codec codec = TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); + static IndexSearcher searcher; + static IndexReader reader; + static Directory directory; - public static int DATASET_SIZE_LIMIT = 1000; - public static int DIMENSIONS_LIMIT = 2048; - public static int NUM_QUERIES_LIMIT = 10; - public static int TOP_K_LIMIT = 64; // TODO This fails beyond 64 + static int DATASET_SIZE_LIMIT = 1000; + static int DIMENSIONS_LIMIT = 2048; + static int NUM_QUERIES_LIMIT = 10; + static int TOP_K_LIMIT = 64; // TODO This fails beyond 64 - public static float[][] dataset = null; + public static float[][] dataset; @BeforeClass public static void beforeClass() throws Exception { + assumeTrue("cuvs not supported", CuVSVectorsFormat.supported()); directory = newDirectory(); - Codec codec = TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); - RandomIndexWriter writer = new RandomIndexWriter( random(), From c9d454d5b91f8862e4ab803d19e40a53603fc84e Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 5 Feb 2025 12:51:03 +0000 Subject: [PATCH 71/88] add CuVSVectorsFormat test --- .../vectorsearch/TestCuVSVectorsFormat.java | 42 +++++++++++++++++++ .../index/BaseKnnVectorsFormatTestCase.java | 17 ++++++-- 2 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java new file mode 100644 index 000000000000..ae5b2403a3e5 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.util.List; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.junit.BeforeClass; + +public class TestCuVSVectorsFormat extends BaseKnnVectorsFormatTestCase { + + @BeforeClass + public static void beforeClass() { + assumeTrue("cuvs is not supported", CuVSVectorsFormat.supported()); + } + + @Override + protected Codec getCodec() { + return TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); + } + + @Override + protected List supportedVectorEncodings() { + return List.of(VectorEncoding.FLOAT32); + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index 97b578e7c5cd..ed1a76133968 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -296,6 +296,7 @@ public KnnVectorsFormat knnVectorsFormat() { } public void testMergingWithDifferentByteKnnFields() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); try (var dir = newDirectory()) { IndexWriterConfig iwc = new IndexWriterConfig(); Codec codec = getCodec(); @@ -994,6 +995,7 @@ public void testFloatVectorScorerIteration() throws Exception { } public void testByteVectorScorerIteration() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); IndexWriterConfig iwc = newIndexWriterConfig(); if (random().nextBoolean()) { iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); @@ -1081,6 +1083,7 @@ public void testEmptyFloatVectorData() throws Exception { } public void testEmptyByteVectorData() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { var doc1 = new Document(); @@ -1112,11 +1115,16 @@ protected VectorSimilarityFunction randomSimilarity() { } /** - * This method is overrideable since old codec versions only support {@link - * VectorEncoding#FLOAT32}. + * The vector encodings supported by the format. Defaults to all VectorEncoding.values(). Override + * if the format only supports a subset of these encodings. */ + protected List supportedVectorEncodings() { + return Arrays.stream(VectorEncoding.values()).toList(); + } + protected VectorEncoding randomVectorEncoding() { - return VectorEncoding.values()[random().nextInt(VectorEncoding.values().length)]; + var encodings = supportedVectorEncodings().toArray(VectorEncoding[]::new); + return encodings[random().nextInt(encodings.length)]; } public void testIndexedValueNotAliased() throws Exception { @@ -1193,6 +1201,7 @@ public void testSortedIndex() throws Exception { } public void testSortedIndexBytes() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); IndexWriterConfig iwc = newIndexWriterConfig(); iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); String fieldName = "field"; @@ -1361,6 +1370,7 @@ public void testRandom() throws Exception { * back consistently. 
*/ public void testRandomBytes() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); IndexWriterConfig iwc = newIndexWriterConfig(); if (random().nextBoolean()) { iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); @@ -1875,6 +1885,7 @@ public void testVectorValuesReportCorrectDocs() throws Exception { } public void testMismatchedFields() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); Directory dir1 = newDirectory(); IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig()); Document doc = new Document(); From ab6beaedb2a16c3718574193f2918515242dd32c Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 5 Feb 2025 12:55:16 +0000 Subject: [PATCH 72/88] fix testWriterRamEstimate --- .../vectorsearch/CuVSVectorsWriter.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 3400d306cd49..3f8301e68119 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -16,6 +16,8 @@ */ package org.apache.lucene.sandbox.vectorsearch; +import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance; + import com.nvidia.cuvs.BruteForceIndex; import com.nvidia.cuvs.BruteForceIndexParams; import com.nvidia.cuvs.CagraIndex; @@ -45,6 +47,8 @@ /** KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU */ /*package-private*/ class CuVSVectorsWriter extends KnnVectorsWriter { + private static final long SHALLOW_RAM_BYTES_USED = shallowSizeOfInstance(CuVSVectorsWriter.class); + // protected Logger log = Logger.getLogger(getClass().getName()); private List fieldVectorWriters = new ArrayList<>(); @@ -90,11 +94,6 @@ public CuVSVectorsWriter( CuVSVectorsFormat.VECTOR_DATA_EXTENSION); } - @Override - public long ramBytesUsed() { - return 0; - } - @Override public void close() throws IOException { IOUtils.close(cuVSIndex); @@ -367,6 +366,15 @@ public void finish() throws IOException { } } + @Override + public long ramBytesUsed() { + long total = SHALLOW_RAM_BYTES_USED; + for (var field : fieldVectorWriters) { + total += field.ramBytesUsed(); + } + return total; + } + /** OutputStream for writing into an IndexOutput */ public class SegmentOutputStream extends OutputStream { From 98686ecb27fa79a21926c90648bfba59371eb2b8 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Wed, 5 Feb 2025 13:04:36 +0000 Subject: [PATCH 73/88] This chanage rewrites the cuVS format implementation. After the rewrite all the BaseKnnVectorsFormatTestCase tests pass. There are still some lurking intermittent failures, but the tests pass successfully the majority of the time. Summary of the most significant changes: 1. Use the flat vectors reader/writer to support the raw float32 vectors and ordinal to docId mapping. This is similar to how HNSW is supported in Lucene. And keeps the code aligned with how other formats are layered atop each other. 2. The cuVS indices (Cagra, brute force, and HNSW) are stored directly in the format, so can be mmap'ed directly. 3. Merges are physical, all raw vectors are retrieved and used to create new cuVS indices. 4. 
A standard KnnCollector is used, no need for a special one for cuVS, unless one wants to customise some very specific parameters. A number of workarounds have been put in place, which will eventually be lifted. 1. pre-filter and deleted docs over sample the topK, since the cuvs-java do not yet support a pre-filter. 2. Ignore Cagra failures indexing with small numbers of docs, fail over to just brute force. --- .../vectorsearch/CagraFieldVectorsWriter.java | 54 -- .../sandbox/vectorsearch/CuVSFieldWriter.java | 80 +++ .../sandbox/vectorsearch/CuVSIndex.java | 32 +- .../vectorsearch/CuVSVectorsFormat.java | 42 +- .../vectorsearch/CuVSVectorsReader.java | 621 +++++++++++------- .../vectorsearch/CuVSVectorsWriter.java | 579 ++++++++-------- .../vectorsearch/IndexInputInputStream.java | 60 ++ .../vectorsearch/IndexOutputOutputStream.java | 70 ++ .../vectorsearch/SegmentInputStream.java | 105 --- .../vectorsearch/SerializationUtils.java | 64 -- .../lucene/sandbox/vectorsearch/Util.java | 82 --- .../lucene/sandbox/vectorsearch/TestCuVS.java | 4 +- .../vectorsearch/TestCuVSVectorsFormat.java | 89 +++ .../TestIndexOutputOutputStream.java | 102 +++ 14 files changed, 1134 insertions(+), 850 deletions(-) delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java deleted file mode 100644 index 183b3c87d431..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
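The over-sampling workaround listed in the commit message above is presumably applied at search time in the reader; the relevant hunk is not shown here. As a rough illustration only, assuming a hypothetical gpuSearch helper that returns docId-to-score results from a cuVS index, the idea looks like the sketch below. The OVERSAMPLE factor and helper names are illustrative and are not the patch's actual API.

    import java.util.Map;
    import org.apache.lucene.search.KnnCollector;
    import org.apache.lucene.util.Bits;

    class OverSampleSketch {
      private static final int OVERSAMPLE = 10; // assumed factor, not taken from the patch

      // Ask the GPU index for more than collector.k() results when a filter or
      // deleted docs may reject some of them, then keep only the accepted docs.
      static void collect(KnnCollector collector, Bits acceptDocs, int maxDoc) {
        int k = acceptDocs == null ? collector.k() : Math.min(maxDoc, collector.k() * OVERSAMPLE);
        Map<Integer, Float> results = gpuSearch(k); // hypothetical stand-in for a cuVS search call
        for (Map.Entry<Integer, Float> e : results.entrySet()) {
          int doc = e.getKey();
          if (acceptDocs == null || acceptDocs.get(doc)) {
            collector.collect(doc, e.getValue());
          }
        }
      }

      private static Map<Integer, Float> gpuSearch(int k) {
        throw new UnsupportedOperationException("stand-in for a cuVS index search");
      }
    }

In this sketch the collector simply receives fewer than k hits if the over-sampled candidate set is heavily filtered, which matches the "best effort until cuvs-java supports a native pre-filter" intent described above.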
- */ -package org.apache.lucene.sandbox.vectorsearch; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.concurrent.ConcurrentHashMap; -import org.apache.lucene.codecs.KnnFieldVectorsWriter; -import org.apache.lucene.index.FieldInfo; - -/** CuVS based fields writer */ -/*package-private*/ class CagraFieldVectorsWriter extends KnnFieldVectorsWriter { - - public final String fieldName; - public final ConcurrentHashMap vectors = - new ConcurrentHashMap(); - public int fieldVectorDimension = -1; - - public CagraFieldVectorsWriter(FieldInfo fieldInfo) { - this.fieldName = fieldInfo.getName(); - this.fieldVectorDimension = fieldInfo.getVectorDimension(); - } - - @Override - public long ramBytesUsed() { - return fieldName.getBytes(StandardCharsets.UTF_8).length - + Integer.BYTES - + ((long) vectors.size() * fieldVectorDimension * Float.BYTES); - } - - @Override - public void addValue(int docID, float[] vectorValue) throws IOException { - vectors.put(docID, vectorValue); - } - - @Override - public float[] copyValue(float[] vectorValue) { - throw new UnsupportedOperationException(); - } -} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java new file mode 100644 index 000000000000..61b8f0879202 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.util.List; +import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter; +import org.apache.lucene.index.DocsWithFieldSet; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.util.RamUsageEstimator; + +/** CuVS based fields writer */ +/*package-private*/ class CuVSFieldWriter extends KnnFieldVectorsWriter { + + private static final long SHALLOW_SIZE = + RamUsageEstimator.shallowSizeOfInstance(CuVSFieldWriter.class); + + private final FieldInfo fieldInfo; + private final FlatFieldVectorsWriter flatFieldVectorsWriter; + private int lastDocID = -1; + + public CuVSFieldWriter( + FieldInfo fieldInfo, FlatFieldVectorsWriter flatFieldVectorsWriter) { + this.fieldInfo = fieldInfo; + this.flatFieldVectorsWriter = flatFieldVectorsWriter; + } + + @Override + public void addValue(int docID, float[] vectorValue) throws IOException { + if (docID == lastDocID) { + throw new IllegalArgumentException( + "VectorValuesField \"" + + fieldInfo.name + + "\" appears more than once in this document (only one value is allowed per field)"); + } + flatFieldVectorsWriter.addValue(docID, vectorValue); + } + + List getVectors() { + return flatFieldVectorsWriter.getVectors(); + } + + FieldInfo fieldInfo() { + return fieldInfo; + } + + DocsWithFieldSet getDocsWithFieldSet() { + return flatFieldVectorsWriter.getDocsWithFieldSet(); + } + + @Override + public float[] copyValue(float[] vectorValue) { + throw new UnsupportedOperationException(); + } + + @Override + public long ramBytesUsed() { + return SHALLOW_SIZE + flatFieldVectorsWriter.ramBytesUsed(); + } + + @Override + public String toString() { + return "CuVSFieldWriter[field name=" + fieldInfo.name + ", number=" + fieldInfo.number + "]"; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index 7b8c19996195..0356d53780d1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -18,38 +18,40 @@ import com.nvidia.cuvs.BruteForceIndex; import com.nvidia.cuvs.CagraIndex; -import java.util.List; +import com.nvidia.cuvs.HnswIndex; import java.util.Objects; /** This class holds references to the actual CuVS Index (Cagra, Brute force, etc.) 
*/ -/*package-private*/ class CuVSIndex { +public class CuVSIndex { private final CagraIndex cagraIndex; private final BruteForceIndex bruteforceIndex; - private final List mapping; - private final List vectors; - private final int maxDocs; + private final HnswIndex hnswIndex; - private final String fieldName; - private final String segmentName; + private int maxDocs; + private String fieldName; + private String segmentName; public CuVSIndex( String segmentName, String fieldName, CagraIndex cagraIndex, - List mapping, - List vectors, int maxDocs, BruteForceIndex bruteforceIndex) { this.cagraIndex = Objects.requireNonNull(cagraIndex); this.bruteforceIndex = Objects.requireNonNull(bruteforceIndex); - this.mapping = Objects.requireNonNull(mapping); - this.vectors = Objects.requireNonNull(vectors); this.fieldName = Objects.requireNonNull(fieldName); this.segmentName = Objects.requireNonNull(segmentName); if (maxDocs < 0) { throw new IllegalArgumentException("negative maxDocs:" + maxDocs); } this.maxDocs = maxDocs; + this.hnswIndex = null; // TODO: + } + + public CuVSIndex(CagraIndex cagraIndex, BruteForceIndex bruteforceIndex, HnswIndex hnswIndex) { + this.cagraIndex = cagraIndex; + this.bruteforceIndex = bruteforceIndex; + this.hnswIndex = hnswIndex; } public CagraIndex getCagraIndex() { @@ -60,18 +62,14 @@ public BruteForceIndex getBruteforceIndex() { return bruteforceIndex; } - public List getMapping() { - return mapping; + public HnswIndex getHNSWIndex() { + return hnswIndex; } public String getFieldName() { return fieldName; } - public List getVectors() { - return vectors; - } - public String getSegmentName() { return segmentName; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index 1a20913f312b..0e839bafe792 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -21,6 +21,9 @@ import java.io.IOException; import java.util.logging.Logger; import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; +import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; @@ -30,16 +33,29 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { private static final Logger LOG = Logger.getLogger(CuVSVectorsFormat.class.getName()); - public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData"; - public static final String VECTOR_DATA_EXTENSION = "cag"; - public static final String META_EXTENSION = "cagmf"; - public static final int VERSION_CURRENT = 0; + // TODO: fix Lucene version in name, to the final targeted release, if any + static final String CUVS_META_CODEC_NAME = "Lucene102CuVSVectorsFormatMeta"; + static final String CUVS_META_CODEC_EXT = "vemc"; // ""cagmf"; + static final String CUVS_INDEX_CODEC_NAME = "Lucene102CuVSVectorsFormatIndex"; + static final String CUVS_INDEX_EXT = "vcag"; + + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + public static final int DEFAULT_WRITER_THREADS = 1; public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; public 
static final int DEFAULT_GRAPH_DEGREE = 64; + // The minimum number of vectors in the dataset required before + // we attempt to build a Cagra index + static final int MIN_CAGRA_INDEX_SIZE = 2; + static CuVSResources resources = cuVSResourcesOrNull(); + /** The format for storing, reading, and merging raw vectors on disk. */ + private static final FlatVectorsFormat flatVectorsFormat = + new Lucene99FlatVectorsFormat(DefaultFlatVectorScorer.INSTANCE); + final int maxDimensions = 4096; final int cuvsWriterThreads; final int intGraphDegree; @@ -69,7 +85,7 @@ private static CuVSResources cuVSResourcesOrNull() { resources = CuVSResources.create(); return resources; } catch (UnsupportedOperationException uoe) { - LOG.warning("cuvs is not supported on this platform or java version"); + LOG.warning("cuvs is not supported on this platform or java version: " + uoe.getMessage()); } catch (Throwable t) { if (t instanceof ExceptionInInitializerError ex) { t = ex.getCause(); @@ -93,18 +109,22 @@ private static void checkSupported() { @Override public CuVSVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { checkSupported(); + var flatWriter = flatVectorsFormat.fieldsWriter(state); return new CuVSVectorsWriter( - state, cuvsWriterThreads, intGraphDegree, graphDegree, mergeStrategy, resources); + state, + cuvsWriterThreads, + intGraphDegree, + graphDegree, + mergeStrategy, + resources, + flatWriter); } @Override public CuVSVectorsReader fieldsReader(SegmentReadState state) throws IOException { checkSupported(); - try { - return new CuVSVectorsReader(state, resources); - } catch (Throwable e) { - throw new RuntimeException(e); - } + var flatReader = flatVectorsFormat.fieldsReader(state); + return new CuVSVectorsReader(state, resources, flatReader); } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index 0afbe18b278e..07b44854f7c2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -16,6 +16,13 @@ */ package org.apache.lucene.sandbox.vectorsearch; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_CODEC_NAME; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_EXT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_EXT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_NAME; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_CURRENT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_START; + import com.nvidia.cuvs.BruteForceIndex; import com.nvidia.cuvs.BruteForceQuery; import com.nvidia.cuvs.CagraIndex; @@ -24,207 +31,252 @@ import com.nvidia.cuvs.CuVSResources; import com.nvidia.cuvs.HnswIndex; import com.nvidia.cuvs.HnswIndexParams; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.lang.StackWalker.StackFrame; -import java.util.ArrayList; -import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Map.Entry; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; +import java.util.logging.Logger; import 
org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.hnsw.FlatVectorsReader; import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.search.KnnCollector; -import org.apache.lucene.search.TopKnnCollector; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.hnsw.IntToIntFunction; /** KnnVectorsReader instance associated with CuVS format */ -/*package-private*/ class CuVSVectorsReader extends KnnVectorsReader { - - // protected Logger log = Logger.getLogger(getClass().getName()); - - IndexInput vectorDataReader = null; - public String fileName = null; - public byte[] indexFileBytes; - public int[] docIds; - public float[] vectors; - public SegmentReadState segmentState = null; - public int indexFilePayloadSize = 0; - public long initialFilePointerLoc = 0; - public SegmentInputStream segmentInputStream; +public class CuVSVectorsReader extends KnnVectorsReader { - // Field to List of Indexes - public Map> cuvsIndexes; + @SuppressWarnings("unused") + private static final Logger log = Logger.getLogger(CuVSVectorsReader.class.getName()); - private CuVSResources resources; + private final CuVSResources resources; + private final FlatVectorsReader flatVectorsReader; // for reading the raw vectors + private final FieldInfos fieldInfos; + private final IntObjectHashMap fields; + private final IntObjectHashMap cuvsIndices; + private final IndexInput cuvsIndexInput; - public CuVSVectorsReader(SegmentReadState state, CuVSResources resources) throws Throwable { - - segmentState = state; + public CuVSVectorsReader( + SegmentReadState state, CuVSResources resources, FlatVectorsReader flatReader) + throws IOException { this.resources = resources; + this.flatVectorsReader = flatReader; + this.fieldInfos = state.fieldInfos; + this.fields = new IntObjectHashMap<>(); - fileName = + String metaFileName = IndexFileNames.segmentFileName( - state.segmentInfo.name, state.segmentSuffix, CuVSVectorsFormat.VECTOR_DATA_EXTENSION); - - vectorDataReader = segmentState.directory.openInput(fileName, segmentState.context); - CodecUtil.readIndexHeader(vectorDataReader); - - initialFilePointerLoc = vectorDataReader.getFilePointer(); - indexFilePayloadSize = - (int) vectorDataReader.length() - - (int) initialFilePointerLoc; // vectorMetaReader.readInt(); - segmentInputStream = - new SegmentInputStream(vectorDataReader, indexFilePayloadSize, initialFilePointerLoc); - // log.info("payloadSize: " + indexFilePayloadSize); - // log.info("initialFilePointerLoc: " + initialFilePointerLoc); - - List stackTrace = StackWalker.getInstance().walk(this::getStackTrace); - - boolean isMergeCase = false; - for (StackFrame s : stackTrace) { - if 
(s.toString().startsWith("org.apache.lucene.index.IndexWriter.merge")) { - isMergeCase = true; - // log.info("Reader opening on merge call"); - break; + state.segmentInfo.name, state.segmentSuffix, CUVS_META_CODEC_EXT); + boolean success = false; + int versionMeta = -1; + try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName)) { + Throwable priorException = null; + try { + versionMeta = + CodecUtil.checkIndexHeader( + meta, + CUVS_META_CODEC_NAME, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + readFields(meta); + } catch (Throwable exception) { + priorException = exception; + } finally { + CodecUtil.checkFooter(meta, priorException); + } + var ioContext = state.context.withReadAdvice(ReadAdvice.SEQUENTIAL); + cuvsIndexInput = openCuVSInput(state, versionMeta, ioContext); + cuvsIndices = loadCuVSIndices(); + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(this); } } - - /*log.info( - "Source of this segment " - + segmentState.segmentSuffix - + " is " - + segmentState.segmentInfo.getDiagnostics().get(IndexWriter.SOURCE)); - log.info("Loading for " + segmentState.segmentInfo.name + ", mergeCase? " + isMergeCase); - log.info("Not the merge case, hence loading for " + segmentState.segmentInfo.name);*/ - this.cuvsIndexes = loadCuVSIndex(getIndexInputStream(), isMergeCase); } - @SuppressWarnings({"unchecked"}) - private Map> loadCuVSIndex(ZipInputStream zis, boolean isMergeCase) - throws Throwable { - Map> ret = new HashMap>(); - Map cagraIndexes = new HashMap(); - Map bruteforceIndexes = new HashMap(); - Map hnswIndexes = new HashMap(); - Map> mappings = new HashMap>(); - Map> vectors = new HashMap>(); - - Map maxDocs = null; // map of segment, maxDocs - ZipEntry ze; - while ((ze = zis.getNextEntry()) != null) { - String entry = ze.getName(); - - String segmentField = entry.split("\\.")[0]; - String extension = entry.split("\\.")[1]; - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - byte[] buffer = new byte[1024]; - int len = 0; - while ((len = zis.read(buffer)) != -1) { - baos.write(buffer, 0, len); + private static IndexInput openCuVSInput( + SegmentReadState state, int versionMeta, IOContext context) throws IOException { + String fileName = + IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, CUVS_INDEX_EXT); + IndexInput in = state.directory.openInput(fileName, context); + boolean success = false; + try { + int versionVectorData = + CodecUtil.checkIndexHeader( + in, + CUVS_INDEX_CODEC_NAME, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + checkVersion(versionMeta, versionVectorData, in); + CodecUtil.retrieveChecksum(in); + success = true; + return in; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(in); } + } + } + + private void validateFieldEntry(FieldInfo info, FieldEntry fieldEntry) { + int dimension = info.getVectorDimension(); + if (dimension != fieldEntry.dims()) { + throw new IllegalStateException( + "Inconsistent vector dimension for field=\"" + + info.name + + "\"; " + + dimension + + " != " + + fieldEntry.dims()); + } + } - switch (extension) { - case "meta": - { - maxDocs = SerializationUtils.deserialize(baos.toByteArray()); - break; - } - case "vec": - { - vectors.put(segmentField, SerializationUtils.deserialize(baos.toByteArray())); - break; - } - case "map": - { - List map = SerializationUtils.deserialize(baos.toByteArray()); - mappings.put(segmentField, 
map); - break; - } - case "cag": - { - cagraIndexes.put( - segmentField, - CagraIndex.newBuilder(resources) - .from(new ByteArrayInputStream(baos.toByteArray())) - .build()); - break; - } - case "bf": - { - bruteforceIndexes.put( - segmentField, - BruteForceIndex.newBuilder(resources) - .from(new ByteArrayInputStream(baos.toByteArray())) - .build()); - break; - } - case "hnsw": - { - HnswIndexParams indexParams = new HnswIndexParams.Builder().build(); - hnswIndexes.put( - segmentField, - HnswIndex.newBuilder(resources) - .from(new ByteArrayInputStream(baos.toByteArray())) - .withIndexParams(indexParams) - .build()); - break; - } + private void readFields(ChecksumIndexInput meta) throws IOException { + for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { + FieldInfo info = fieldInfos.fieldInfo(fieldNumber); + if (info == null) { + throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); } + FieldEntry fieldEntry = readField(meta, info); + validateFieldEntry(info, fieldEntry); + fields.put(info.number, fieldEntry); + } + } + + // List of vector similarity functions. This list is defined here, in order + // to avoid an undesirable dependency on the declaration and order of values + // in VectorSimilarityFunction. The list values and order must be identical + // to that of {@link o.a.l.c.l.Lucene94FieldInfosFormat#SIMILARITY_FUNCTIONS}. + static final List SIMILARITY_FUNCTIONS = + List.of( + VectorSimilarityFunction.EUCLIDEAN, + VectorSimilarityFunction.DOT_PRODUCT, + VectorSimilarityFunction.COSINE, + VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT); + + static VectorSimilarityFunction readSimilarityFunction(DataInput input) throws IOException { + int i = input.readInt(); + if (i < 0 || i >= SIMILARITY_FUNCTIONS.size()) { + throw new IllegalArgumentException("invalid distance function: " + i); + } + return SIMILARITY_FUNCTIONS.get(i); + } + + static VectorEncoding readVectorEncoding(DataInput input) throws IOException { + int encodingId = input.readInt(); + if (encodingId < 0 || encodingId >= VectorEncoding.values().length) { + throw new CorruptIndexException("Invalid vector encoding id: " + encodingId, input); } + return VectorEncoding.values()[encodingId]; + } - /*log.info("Loading cuvsIndexes from segment: " + segmentState.segmentInfo.name); - log.info("Diagnostics for this segment: " + segmentState.segmentInfo.getDiagnostics()); - log.info("Loading map of cagraIndexes: " + cagraIndexes); - log.info("Loading vectors: " + vectors); - log.info("Loading mapping: " + mappings);*/ - - for (String segmentField : cagraIndexes.keySet()) { - // log.info("Loading segmentField: " + segmentField); - String segment = segmentField.split("/")[0]; - String field = segmentField.split("/")[1]; - CuVSIndex cuvsIndex = - new CuVSIndex( - segment, - field, - cagraIndexes.get(segmentField), - mappings.get(segmentField), - vectors.get(segmentField), - maxDocs.get(segment), - bruteforceIndexes.get(segmentField)); - List listOfIndexes = - ret.containsKey(field) ? 
ret.get(field) : new ArrayList(); - listOfIndexes.add(cuvsIndex); - ret.put(field, listOfIndexes); + private FieldEntry readField(IndexInput input, FieldInfo info) throws IOException { + VectorEncoding vectorEncoding = readVectorEncoding(input); + VectorSimilarityFunction similarityFunction = readSimilarityFunction(input); + if (similarityFunction != info.getVectorSimilarityFunction()) { + throw new IllegalStateException( + "Inconsistent vector similarity function for field=\"" + + info.name + + "\"; " + + similarityFunction + + " != " + + info.getVectorSimilarityFunction()); } - return ret; + return FieldEntry.readEntry(input, vectorEncoding, info.getVectorSimilarityFunction()); } - public List getStackTrace(Stream stackFrameStream) { - return stackFrameStream.collect(Collectors.toList()); + private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) { + final FieldInfo info = fieldInfos.fieldInfo(field); + final FieldEntry fieldEntry; + if (info == null || (fieldEntry = fields.get(info.number)) == null) { + throw new IllegalArgumentException("field=\"" + field + "\" not found"); + } + if (fieldEntry.vectorEncoding != expectedEncoding) { + throw new IllegalArgumentException( + "field=\"" + + field + + "\" is encoded as: " + + fieldEntry.vectorEncoding + + " expected: " + + expectedEncoding); + } + return fieldEntry; } - public ZipInputStream getIndexInputStream() throws IOException { - segmentInputStream.reset(); - return new ZipInputStream(segmentInputStream); + private IntObjectHashMap loadCuVSIndices() throws IOException { + var indices = new IntObjectHashMap(); + for (var e : fields) { + var fieldEntry = e.value; + int fieldNumber = e.key; + var cuvsIndex = loadCuVSIndex(fieldEntry); + indices.put(fieldNumber, cuvsIndex); + } + return indices; + } + + private CuVSIndex loadCuVSIndex(FieldEntry fieldEntry) throws IOException { + CagraIndex cagraIndex = null; + BruteForceIndex bruteForceIndex = null; + HnswIndex hnswIndex = null; + + try { + long len = fieldEntry.cagraIndexLength(); + if (len > 0) { + long off = fieldEntry.cagraIndexOffset(); + try (var slice = cuvsIndexInput.slice("cagra index", off, len); + var in = new IndexInputInputStream(slice)) { + cagraIndex = CagraIndex.newBuilder(resources).from(in).build(); + } + } + + len = fieldEntry.bruteForceIndexLength(); + if (len > 0) { + long off = fieldEntry.bruteForceIndexOffset(); + try (var slice = cuvsIndexInput.slice("bf index", off, len); + var in = new IndexInputInputStream(slice)) { + bruteForceIndex = BruteForceIndex.newBuilder(resources).from(in).build(); + } + } + + len = fieldEntry.hnswIndexLength(); + if (len > 0) { + long off = fieldEntry.hnswIndexOffset(); + try (var slice = cuvsIndexInput.slice("hnsw index", off, len); + var in = new IndexInputInputStream(slice)) { + var params = new HnswIndexParams.Builder().build(); + hnswIndex = HnswIndex.newBuilder(resources).withIndexParams(params).from(in).build(); + } + } + } catch (Throwable t) { + handleThrowable(t); + } + return new CuVSIndex(cagraIndex, bruteForceIndex, hnswIndex); } @Override public void close() throws IOException { - IOUtils.close(vectorDataReader); + IOUtils.close(flatVectorsReader, cuvsIndexInput); } @Override @@ -234,106 +286,189 @@ public void checkIntegrity() throws IOException { @Override public FloatVectorValues getFloatVectorValues(String field) throws IOException { - return new FloatVectorValues() { - - @Override - public int size() { - return cuvsIndexes.get(field).get(0).getVectors().size(); - } + return 
flatVectorsReader.getFloatVectorValues(field); + } - @Override - public int dimension() { - return cuvsIndexes.get(field).get(0).getVectors().get(0).length; - } + @Override + public ByteVectorValues getByteVectorValues(String field) { + throw new UnsupportedOperationException("byte vectors not supported"); + } - @Override - public float[] vectorValue(int pos) throws IOException { - return cuvsIndexes.get(field).get(0).getVectors().get(pos); - } + /** Native float to float function */ + public interface FloatToFloatFunction { + float apply(float v); + } - @Override - public FloatVectorValues copy() throws IOException { - return null; - } - }; + static long[] bitsToLongArray(Bits bits) { + if (bits instanceof FixedBitSet fixedBitSet) { + return fixedBitSet.getBits(); + } else { + return FixedBitSet.copyOf(bits).getBits(); + } } - @Override - public ByteVectorValues getByteVectorValues(String field) throws IOException { - throw new UnsupportedOperationException(); + static FloatToFloatFunction getScoreNormalizationFunc(VectorSimilarityFunction sim) { + // TODO: check for different similarities + return score -> (1f / (1f + score)); } + // This is a hack - replace with cuVS bugId/filter support + static final int FILTER_OVER_SAMPLE = 10; + @Override public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - PerLeafCuVSKnnCollector cuvsCollector = - knnCollector instanceof PerLeafCuVSKnnCollector - ? ((PerLeafCuVSKnnCollector) knnCollector) - : new PerLeafCuVSKnnCollector(knnCollector.k(), knnCollector.k(), 1); - TopKnnCollector defaultCollector = - knnCollector instanceof TopKnnCollector ? ((TopKnnCollector) knnCollector) : null; - - int prevDocCount = 0; - - // log.debug("Will try to search all the indexes for segment "+segmentState.segmentInfo.name+", - // field "+field+": "+cuvsIndexes); - for (CuVSIndex cuvsIndex : cuvsIndexes.get(field)) { - try { - Map result = new HashMap(); - if (cuvsCollector.k() <= 1024) { - CagraSearchParams searchParams = - new CagraSearchParams.Builder(resources) - .withItopkSize(cuvsCollector.iTopK) - .withSearchWidth(cuvsCollector.searchWidth) - .build(); - - CagraQuery query = - new CagraQuery.Builder() - .withTopK(cuvsCollector.k()) - .withSearchParams(searchParams) - .withMapping(cuvsIndex.getMapping()) - .withQueryVectors(new float[][] {target}) - .build(); - - CagraIndex cagraIndex = cuvsIndex.getCagraIndex(); - assert (cagraIndex != null); - // log.info("k is " + cuvsCollector.k()); - result = - cagraIndex - .search(query) - .getResults() - .get(0); // List expected to have only one entry because of single query "target". 
- // log.info("INTERMEDIATE RESULT FROM CUVS: " + result + ", prevDocCount=" + - // prevDocCount); - } else { - BruteForceQuery bruteforceQuery = - new BruteForceQuery.Builder() - .withQueryVectors(new float[][] {target}) - .withPrefilter(((FixedBitSet) acceptDocs).getBits()) - .withTopK(cuvsCollector.k()) - .build(); - - BruteForceIndex bruteforceIndex = cuvsIndex.getBruteforceIndex(); - result = bruteforceIndex.search(bruteforceQuery).getResults().get(0); - } + var fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32); + if (fieldEntry.count() == 0 || knnCollector.k() == 0) { + return; + } - for (Entry kv : result.entrySet()) { - if (defaultCollector != null) { - defaultCollector.collect(prevDocCount + kv.getKey(), kv.getValue()); - } - cuvsCollector.collect(prevDocCount + kv.getKey(), kv.getValue()); - } + var fieldNumber = fieldInfos.fieldInfo(field).number; + // log.info("fieldNumber=" + fieldNumber + ", fieldEntry.count()=" + fieldEntry.count()); + + CuVSIndex cuvsIndex = cuvsIndices.get(fieldNumber); + if (cuvsIndex == null) { + throw new IllegalStateException("not index found for field:" + field); + } - } catch (Throwable e) { - throw new RuntimeException(e); + int collectorTopK = knnCollector.k(); + if (acceptDocs != null) { + collectorTopK = knnCollector.k() * FILTER_OVER_SAMPLE; + } + final int topK = Math.min(collectorTopK, fieldEntry.count()); + + Map result; + if (knnCollector.k() <= 1024 && cuvsIndex.getCagraIndex() != null) { + // log.info("searching cagra index"); + CagraSearchParams searchParams = + new CagraSearchParams.Builder(resources) + .withItopkSize(topK) // TODO: params + .withSearchWidth(1) + .build(); + + var query = + new CagraQuery.Builder() + .withTopK(topK) + .withSearchParams(searchParams) + .withMapping(null) + .withQueryVectors(new float[][] {target}) + .build(); + + CagraIndex cagraIndex = cuvsIndex.getCagraIndex(); + List> searchResult = null; + try { + searchResult = cagraIndex.search(query).getResults(); + } catch (Throwable t) { + handleThrowable(t); + } + // List expected to have only one entry because of single query "target". 
+ assert searchResult.size() == 1; + result = searchResult.getFirst(); + } else { + BruteForceIndex bruteforceIndex = cuvsIndex.getBruteforceIndex(); + assert bruteforceIndex != null; + // log.info("searching brute index, with actual topK=" + topK); + var queryBuilder = + new BruteForceQuery.Builder().withQueryVectors(new float[][] {target}).withTopK(topK); + BruteForceQuery query = queryBuilder.build(); + + List> searchResult = null; + try { + searchResult = bruteforceIndex.search(query).getResults(); + } catch (Throwable t) { + handleThrowable(t); + } + assert searchResult.size() == 1; + result = searchResult.getFirst(); + } + assert result != null; + + final var rawValues = flatVectorsReader.getFloatVectorValues(field); + final Bits acceptedOrds = rawValues.getAcceptOrds(acceptDocs); + final var ordToDocFunction = (IntToIntFunction) rawValues::ordToDoc; + final var scoreCorrectionFunction = getScoreNormalizationFunc(fieldEntry.similarityFunction); + + for (var entry : result.entrySet()) { + int ord = entry.getKey(); + float score = entry.getValue(); + if (acceptedOrds == null || acceptedOrds.get(ord)) { + if (knnCollector.earlyTerminated()) { + break; + } + assert ord >= 0 : "unexpected ord: " + ord; + int doc = ordToDocFunction.apply(ord); + float correctedScore = scoreCorrectionFunction.apply(score); + knnCollector.incVisitedCount(1); + knnCollector.collect(doc, correctedScore); } - prevDocCount += cuvsIndex.getMaxDocs(); } } @Override public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException { - throw new UnsupportedOperationException(); + throw new UnsupportedOperationException("byte vectors not supported"); + } + + record FieldEntry( + VectorEncoding vectorEncoding, + VectorSimilarityFunction similarityFunction, + int dims, + int count, + long cagraIndexOffset, + long cagraIndexLength, + long bruteForceIndexOffset, + long bruteForceIndexLength, + long hnswIndexOffset, + long hnswIndexLength) { + + static FieldEntry readEntry( + IndexInput input, + VectorEncoding vectorEncoding, + VectorSimilarityFunction similarityFunction) + throws IOException { + var dims = input.readInt(); + var count = input.readInt(); + var cagraIndexOffset = input.readVLong(); + var cagraIndexLength = input.readVLong(); + var bruteForceIndexOffset = input.readVLong(); + var bruteForceIndexLength = input.readVLong(); + var hnswIndexOffset = input.readVLong(); + var hnswIndexLength = input.readVLong(); + return new FieldEntry( + vectorEncoding, + similarityFunction, + dims, + count, + cagraIndexOffset, + cagraIndexLength, + bruteForceIndexOffset, + bruteForceIndexLength, + hnswIndexOffset, + hnswIndexLength); + } + } + + static void checkVersion(int versionMeta, int versionVectorData, IndexInput in) + throws CorruptIndexException { + if (versionMeta != versionVectorData) { + throw new CorruptIndexException( + "Format versions mismatch: meta=" + + versionMeta + + ", " + + CUVS_META_CODEC_NAME + + "=" + + versionVectorData, + in); + } + } + + static void handleThrowable(Throwable t) throws IOException { + switch (t) { + case IOException ioe -> throw ioe; + case Error error -> throw error; + case RuntimeException re -> throw re; + case null, default -> throw new RuntimeException("UNEXPECTED: exception type", t); + } } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 3f8301e68119..013ee0f40433 100644 --- 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -16,6 +16,15 @@ */ package org.apache.lucene.sandbox.vectorsearch; +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.SIMILARITY_FUNCTIONS; +import static org.apache.lucene.index.VectorEncoding.FLOAT32; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_CODEC_NAME; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_EXT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_EXT; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_NAME; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.MIN_CAGRA_INDEX_SIZE; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_CURRENT; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance; import com.nvidia.cuvs.BruteForceIndex; @@ -24,47 +33,55 @@ import com.nvidia.cuvs.CagraIndexParams; import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo; import com.nvidia.cuvs.CuVSResources; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.time.Duration; import java.util.ArrayList; -import java.util.LinkedHashMap; import java.util.List; +import java.util.Objects; +import java.util.logging.Logger; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatVectorsWriter; +import org.apache.lucene.index.DocsWithFieldSet; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Sorter; import org.apache.lucene.index.Sorter.DocMap; +import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.SuppressForbidden; /** KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU */ -/*package-private*/ class CuVSVectorsWriter extends KnnVectorsWriter { +public class CuVSVectorsWriter extends KnnVectorsWriter { private static final long SHALLOW_RAM_BYTES_USED = shallowSizeOfInstance(CuVSVectorsWriter.class); - // protected Logger log = Logger.getLogger(getClass().getName()); - - private List fieldVectorWriters = new ArrayList<>(); - private IndexOutput cuVSIndex = null; - private SegmentWriteState segmentWriteState = null; - private String cuVSDataFilename = null; - - private CagraIndex cagraIndex; - private CagraIndex cagraIndexForHnsw; + @SuppressWarnings("unused") + private static final Logger log = Logger.getLogger(CuVSVectorsWriter.class.getName()); private final int cuvsWriterThreads; private final int intGraphDegree; private final int graphDegree; + + @SuppressWarnings("unused") private final MergeStrategy mergeStrategy; + private final CuVSResources resources; + private final FlatVectorsWriter flatVectorsWriter; 
// for writing the raw vectors + private final List fields = new ArrayList<>(); + private final IndexOutput meta, cuvsIndex; + private boolean finished; + /** Merge strategy used for CuVS */ public enum MergeStrategy { TRIVIAL_MERGE, @@ -77,337 +94,353 @@ public CuVSVectorsWriter( int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy, - CuVSResources resources) + CuVSResources resources, + FlatVectorsWriter flatVectorsWriter) throws IOException { super(); - this.segmentWriteState = state; this.mergeStrategy = mergeStrategy; this.cuvsWriterThreads = cuvsWriterThreads; this.intGraphDegree = intGraphDegree; this.graphDegree = graphDegree; this.resources = resources; + this.flatVectorsWriter = flatVectorsWriter; - cuVSDataFilename = + String metaFileName = IndexFileNames.segmentFileName( - this.segmentWriteState.segmentInfo.name, - this.segmentWriteState.segmentSuffix, - CuVSVectorsFormat.VECTOR_DATA_EXTENSION); + state.segmentInfo.name, state.segmentSuffix, CUVS_META_CODEC_EXT); + String cagraFileName = + IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, CUVS_INDEX_EXT); + + boolean success = false; + try { + meta = state.directory.createOutput(metaFileName, state.context); + cuvsIndex = state.directory.createOutput(cagraFileName, state.context); + CodecUtil.writeIndexHeader( + meta, + CUVS_META_CODEC_NAME, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + CodecUtil.writeIndexHeader( + cuvsIndex, + CUVS_INDEX_CODEC_NAME, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + success = true; + } finally { + if (success == false) { + IOUtils.closeWhileHandlingException(this); + } + } } @Override - public void close() throws IOException { - IOUtils.close(cuVSIndex); - cuVSIndex = null; - fieldVectorWriters.clear(); - fieldVectorWriters = null; + public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { + var encoding = fieldInfo.getVectorEncoding(); + if (encoding != FLOAT32) { + throw new IllegalArgumentException("expected float32, got:" + encoding); + } + var writer = Objects.requireNonNull(flatVectorsWriter.addField(fieldInfo)); + @SuppressWarnings("unchecked") + var flatWriter = (FlatFieldVectorsWriter) writer; + var cuvsFieldWriter = new CuVSFieldWriter(fieldInfo, flatWriter); + fields.add(cuvsFieldWriter); + return writer; } - @Override - public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { - CagraFieldVectorsWriter cagraFieldVectorWriter = new CagraFieldVectorsWriter(fieldInfo); - fieldVectorWriters.add(cagraFieldVectorWriter); - return cagraFieldVectorWriter; + static String indexMsg(int size, int... 
args) { + StringBuilder sb = new StringBuilder("cagra index params"); + sb.append(": size=").append(size); + sb.append(", intGraphDegree=").append(args[0]); + sb.append(", actualIntGraphDegree=").append(args[1]); + sb.append(", graphDegree=").append(args[2]); + sb.append(", actualGraphDegree=").append(args[3]); + return sb.toString(); } - @SuppressForbidden(reason = "A temporary java.util.File is needed for Cagra's serialization") - private byte[] createCagraIndex(float[][] vectors, List mapping) throws Throwable { - CagraIndexParams indexParams = - new CagraIndexParams.Builder() - .withNumWriterThreads(cuvsWriterThreads) - .withIntermediateGraphDegree(intGraphDegree) - .withGraphDegree(graphDegree) - .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) - .build(); + private CagraIndexParams cagraIndexParams(int size) { + if (size < 2) { + // https://github.com/rapidsai/cuvs/issues/666 + throw new IllegalArgumentException("cagra index must be greater than 2"); + } + var minIntGraphDegree = Math.min(intGraphDegree, size); + var minGraphDegree = Math.min(graphDegree, minIntGraphDegree); + // log.info(indexMsg(size, intGraphDegree, minIntGraphDegree, graphDegree, minGraphDegree)); + + return new CagraIndexParams.Builder() + .withNumWriterThreads(cuvsWriterThreads) + .withIntermediateGraphDegree(minIntGraphDegree) + .withGraphDegree(minGraphDegree) + .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) + .build(); + } + + static long nanosToMillis(long nanos) { + return Duration.ofNanos(nanos).toMillis(); + } - // log.info("Indexing started: " + System.currentTimeMillis()); - cagraIndex = + private void writeCagraIndex(OutputStream os, float[][] vectors) throws Throwable { + if (vectors.length < 2) { + throw new IllegalArgumentException(vectors.length + " vectors, less than min [2] required"); + } + CagraIndexParams indexParams = cagraIndexParams(vectors.length); + // long startTime = System.nanoTime(); + var index = CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build(); - // log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + - // vectors.length); - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - Path tmpFile = - Files.createTempFile( - "tmpindex", "cag"); // TODO: Should we make this a file with random names? - cagraIndex.serialize(baos, tmpFile); - return baos.toByteArray(); + // long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + // log.info("Cagra index created: " + elapsedMillis + "ms, documents: " + vectors.length); + + Path tmpFile = Files.createTempFile(resources.tempDirectory(), "tmpindex", "cag"); + index.serialize(os, tmpFile); } - @SuppressForbidden(reason = "A temporary java.util.File is needed for BruteForce's serialization") - private byte[] createBruteForceIndex(float[][] vectors) throws Throwable { + private void writeBruteForceIndex(OutputStream os, float[][] vectors) throws Throwable { BruteForceIndexParams indexParams = new BruteForceIndexParams.Builder() .withNumWriterThreads(32) // TODO: Make this configurable later. 
.build(); - // log.info("Indexing started: " + System.currentTimeMillis()); + // long startTime = System.nanoTime(); BruteForceIndex index = BruteForceIndex.newBuilder(resources) .withIndexParams(indexParams) .withDataset(vectors) .build(); + // long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + // log.info("BruteForce index created: " + elapsedMillis + "ms, documents: " + vectors.length); - // log.info("Indexing done: " + System.currentTimeMillis()); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - index.serialize(baos); - return baos.toByteArray(); + index.serialize(os); } - @SuppressForbidden(reason = "A temporary java.util.File is needed for HNSW's serialization") - private byte[] createHnswIndex(float[][] vectors) throws Throwable { - CagraIndexParams indexParams = - new CagraIndexParams.Builder() - .withNumWriterThreads(cuvsWriterThreads) - .withIntermediateGraphDegree(intGraphDegree) - .withGraphDegree(graphDegree) - .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT) - .build(); + private void writeHNSWIndex(OutputStream os, float[][] vectors) throws Throwable { + if (vectors.length < 2) { + throw new IllegalArgumentException(vectors.length + " vectors, less than min [2] required"); + } + CagraIndexParams indexParams = cagraIndexParams(vectors.length); - // log.info("Indexing started: " + System.currentTimeMillis()); - cagraIndexForHnsw = + // long startTime = System.nanoTime(); + var index = CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build(); - // log.info("Indexing done: " + System.currentTimeMillis() + "ms, documents: " + - // vectors.length); + // long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + // log.info("HNSW index created: " + elapsedMillis + "ms, documents: " + vectors.length); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); Path tmpFile = Files.createTempFile("tmpindex", "hnsw"); - cagraIndexForHnsw.serializeToHNSW(baos, tmpFile); - return baos.toByteArray(); + index.serializeToHNSW(os, tmpFile); } - @SuppressWarnings({"resource", "rawtypes", "unchecked"}) @Override public void flush(int maxDoc, DocMap sortMap) throws IOException { - cuVSIndex = - this.segmentWriteState.directory.createOutput( - cuVSDataFilename, this.segmentWriteState.context); - CodecUtil.writeIndexHeader( - cuVSIndex, - CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, - CuVSVectorsFormat.VERSION_CURRENT, - this.segmentWriteState.segmentInfo.getId(), - this.segmentWriteState.segmentSuffix); - - CuVSSegmentFile cuVSFile = new CuVSSegmentFile(new SegmentOutputStream(cuVSIndex, 100000)); - - LinkedHashMap metaMap = new LinkedHashMap(); - - for (CagraFieldVectorsWriter field : fieldVectorWriters) { - // long start = System.currentTimeMillis(); - - byte[] cagraIndexBytes = null; - byte[] bruteForceIndexBytes = null; - byte[] hnswIndexBytes = null; - try { - // log.info("Starting CAGRA indexing, space remaining: " + new File("/").getFreeSpace()); - // log.info("Starting CAGRA indexing, docs: " + field.vectors.size()); - - float vectors[][] = new float[field.vectors.size()][field.vectors.get(0).length]; - for (int i = 0; i < vectors.length; i++) { - for (int j = 0; j < vectors[i].length; j++) { - vectors[i][j] = field.vectors.get(i)[j]; - } - } - - cagraIndexBytes = createCagraIndex(vectors, new ArrayList(field.vectors.keySet())); - bruteForceIndexBytes = createBruteForceIndex(vectors); - hnswIndexBytes = createHnswIndex(vectors); - } catch (Throwable e) { - throw new RuntimeException(e); + 
flatVectorsWriter.flush(maxDoc, sortMap); + for (var field : fields) { + if (sortMap == null) { + writeField(field); + } else { + writeSortingField(field, sortMap); } - - // start = System.currentTimeMillis(); - cuVSFile.addFile( - segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".cag", cagraIndexBytes); - // log.info( - // "time for writing CAGRA index bytes to zip: " + (System.currentTimeMillis() - start)); - - // start = System.currentTimeMillis(); - cuVSFile.addFile( - segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".bf", bruteForceIndexBytes); - /*log.info( - "time for writing BRUTEFORCE index bytes to zip: " - + (System.currentTimeMillis() - start));*/ - - // start = System.currentTimeMillis(); - cuVSFile.addFile( - segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".hnsw", hnswIndexBytes); - // log.info("time for writing HNSW index bytes to zip: " + (System.currentTimeMillis() - - // start)); - - // start = System.currentTimeMillis(); - cuVSFile.addFile( - segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".vec", - SerializationUtils.serialize(new ArrayList(field.vectors.values()))); - cuVSFile.addFile( - segmentWriteState.segmentInfo.name + "/" + field.fieldName + ".map", - SerializationUtils.serialize(new ArrayList(field.vectors.keySet()))); - // log.info("list serializing and writing: " + (System.currentTimeMillis() - start)); - field.vectors.clear(); } - - metaMap.put(segmentWriteState.segmentInfo.name, maxDoc); - cuVSFile.addFile( - segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); - cuVSFile.close(); - - CodecUtil.writeFooter(cuVSIndex); } - SegmentOutputStream mergeOutputStream = null; - CuVSSegmentFile mergedIndexFile = null; - - @SuppressWarnings("resource") - @Override - public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { - List segInputStreams = new ArrayList(); - List readers = new ArrayList(); + private void writeField(CuVSFieldWriter fieldData) throws IOException { + // TODO: Argh! + float[][] vectors = fieldData.getVectors().toArray(float[][]::new); + writeFieldInternal(fieldData.fieldInfo(), vectors); + } - for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { - CuVSVectorsReader reader = (CuVSVectorsReader) mergeState.knnVectorsReaders[i]; - segInputStreams.add(reader.segmentInputStream); - readers.add(reader); - } + private void writeSortingField(CuVSFieldWriter fieldData, Sorter.DocMap sortMap) + throws IOException { + DocsWithFieldSet oldDocsWithFieldSet = fieldData.getDocsWithFieldSet(); + final int[] new2OldOrd = new int[oldDocsWithFieldSet.cardinality()]; // new ord to old ord - // log.info("Merging one field for segment: " + segmentWriteState.segmentInfo.name); - // log.info("Segment files? " + Arrays.toString(segmentWriteState.directory.listAll())); + mapOldOrdToNewOrd(oldDocsWithFieldSet, sortMap, null, new2OldOrd, null); - if (!List.of(segmentWriteState.directory.listAll()).contains(cuVSDataFilename)) { - IndexOutput mergedVectorIndex = - segmentWriteState.directory.createOutput(cuVSDataFilename, segmentWriteState.context); - CodecUtil.writeIndexHeader( - mergedVectorIndex, - CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME, - CuVSVectorsFormat.VERSION_CURRENT, - segmentWriteState.segmentInfo.getId(), - segmentWriteState.segmentSuffix); - this.mergeOutputStream = new SegmentOutputStream(mergedVectorIndex, 100000); - mergedIndexFile = new CuVSSegmentFile(this.mergeOutputStream); + // TODO: Argh! 
we need to be able to avoid loading all vectors into contiguous heap memory + float[][] oldVectors = fieldData.getVectors().toArray(float[][]::new); + float[][] newVectors = new float[oldVectors.length][]; + for (int i = 0; i < oldVectors.length; i++) { + newVectors[i] = oldVectors[new2OldOrd[i]]; } + writeFieldInternal(fieldData.fieldInfo(), newVectors); + } - // log.info("Segment files? " + Arrays.toString(segmentWriteState.directory.listAll())); - - if (mergeStrategy.equals(MergeStrategy.TRIVIAL_MERGE)) { - throw new UnsupportedOperationException(); - } else if (mergeStrategy.equals(MergeStrategy.NON_TRIVIAL_MERGE)) { - // log.info("Readers: " + segInputStreams.size() + ", deocMaps: " + - // mergeState.docMaps.length); - ArrayList docMapList = new ArrayList(); - - for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) { - // CuVSVectorsReader reader = (CuVSVectorsReader) mergeState.knnVectorsReaders[i]; - // for (CuVSIndex index : reader.cuvsIndexes.get(fieldInfo.name)) { - // log.info("Mapping for segment (" + reader.fileName + "): " + index.getMapping()); - // log.info("Mapping for segment (" + reader.fileName + "): " + - // index.getMapping().size()); - for (int id = 0; id < mergeState.maxDocs[i]; id++) { - docMapList.add(mergeState.docMaps[i].get(id)); + private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws IOException { + long cagraIndexOffset, cagraIndexLength; + long bruteForceIndexOffset, bruteForceIndexLength; + long hnswIndexOffset, hnswIndexLength; + assert vectors.length > 0; + try { + // write the cagra graph + var cagraIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + cagraIndexOffset = cuvsIndex.getFilePointer(); + if (vectors.length > MIN_CAGRA_INDEX_SIZE) { + try { + writeCagraIndex(cagraIndexOutputStream, vectors); + } catch (Throwable t) { + handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); } - // log.info("DocMaps for segment (" + reader.fileName + "): " + docMapList); - // } } - - ArrayList mergedVectors = - Util.getMergedVectors( - segInputStreams, fieldInfo.name, segmentWriteState.segmentInfo.name); - // log.info("Final mapping: " + docMapList); - // log.info("Final mapping: " + docMapList.size()); - // log.info("Merged vectors: " + mergedVectors.size()); - LinkedHashMap metaMap = new LinkedHashMap(); - byte[] cagraIndexBytes = null; - byte[] bruteForceIndexBytes = null; - byte[] hnswIndexBytes = null; - try { - float vectors[][] = new float[mergedVectors.size()][mergedVectors.get(0).length]; - for (int i = 0; i < vectors.length; i++) { - for (int j = 0; j < vectors[i].length; j++) { - vectors[i][j] = mergedVectors.get(i)[j]; - } + cagraIndexLength = cuvsIndex.getFilePointer() - cagraIndexOffset; + + // write the brute force index + var bruteForceIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + bruteForceIndexOffset = cuvsIndex.getFilePointer(); + writeBruteForceIndex(bruteForceIndexOutputStream, vectors); + bruteForceIndexLength = cuvsIndex.getFilePointer() - bruteForceIndexOffset; + + // write the hnsw index + var hnswIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + hnswIndexOffset = cuvsIndex.getFilePointer(); + if (vectors.length > MIN_CAGRA_INDEX_SIZE) { + try { + writeHNSWIndex(hnswIndexOutputStream, vectors); + } catch (Throwable t) { + handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); } - cagraIndexBytes = createCagraIndex(vectors, new ArrayList()); - bruteForceIndexBytes = createBruteForceIndex(vectors); - hnswIndexBytes = createHnswIndex(vectors); - } catch (Throwable e) { - 
throw new RuntimeException(e); - } - mergedIndexFile.addFile( - segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".cag", cagraIndexBytes); - mergedIndexFile.addFile( - segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".bf", - bruteForceIndexBytes); - mergedIndexFile.addFile( - segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".hnsw", hnswIndexBytes); - mergedIndexFile.addFile( - segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".vec", - SerializationUtils.serialize(mergedVectors)); - mergedIndexFile.addFile( - segmentWriteState.segmentInfo.name + "/" + fieldInfo.getName() + ".map", - SerializationUtils.serialize(docMapList)); - metaMap.put(segmentWriteState.segmentInfo.name, mergedVectors.size()); - if (mergedIndexFile.getFilesAdded().contains(segmentWriteState.segmentInfo.name + ".meta") - == false) { - mergedIndexFile.addFile( - segmentWriteState.segmentInfo.name + ".meta", SerializationUtils.serialize(metaMap)); } - // log.info("DocMaps: " + Arrays.toString(mergeState.docMaps)); - - metaMap.clear(); + hnswIndexLength = cuvsIndex.getFilePointer() - hnswIndexOffset; + + // StringBuilder sb = new StringBuilder("writeField "); + // sb.append(": fieldInfo.name=").append(fieldInfo.name); + // sb.append(", fieldInfo.number=").append(fieldInfo.number); + // sb.append(", size=").append(vectors.length); + // sb.append(", cagraIndexLength=").append(cagraIndexLength); + // sb.append(", bruteForceIndexLength=").append(bruteForceIndexLength); + // sb.append(", hnswIndexLength=").append(hnswIndexLength); + // log.info(sb.toString()); + + writeMeta( + fieldInfo, + vectors.length, + cagraIndexOffset, + cagraIndexLength, + bruteForceIndexOffset, + bruteForceIndexLength, + hnswIndexOffset, + hnswIndexLength); + } catch (Throwable t) { + handleThrowable(t); } } - @Override - public void finish() throws IOException { - if (this.mergeOutputStream != null) { - mergedIndexFile.close(); - CodecUtil.writeFooter(mergeOutputStream.out); - IOUtils.close(mergeOutputStream.out); - this.mergeOutputStream = null; - this.mergedIndexFile = null; + private void writeMeta( + FieldInfo field, + int count, + long cagraIndexOffset, + long cagraIndexLength, + long bruteForceIndexOffset, + long bruteForceIndexLength, + long hnswIndexOffset, + long hnswIndexLength) + throws IOException { + meta.writeInt(field.number); + meta.writeInt(field.getVectorEncoding().ordinal()); + meta.writeInt(distFuncToOrd(field.getVectorSimilarityFunction())); + meta.writeInt(field.getVectorDimension()); + meta.writeInt(count); + meta.writeVLong(cagraIndexOffset); + meta.writeVLong(cagraIndexLength); + meta.writeVLong(bruteForceIndexOffset); + meta.writeVLong(bruteForceIndexLength); + meta.writeVLong(hnswIndexOffset); + meta.writeVLong(hnswIndexLength); + } + + static int distFuncToOrd(VectorSimilarityFunction func) { + for (int i = 0; i < SIMILARITY_FUNCTIONS.size(); i++) { + if (SIMILARITY_FUNCTIONS.get(i).equals(func)) { + return (byte) i; + } } + throw new IllegalArgumentException("invalid distance function: " + func); } - @Override - public long ramBytesUsed() { - long total = SHALLOW_RAM_BYTES_USED; - for (var field : fieldVectorWriters) { - total += field.ramBytesUsed(); + // We currently ignore this, until cuVS supports tiered indices + private static final String CANNOT_GENERATE_CAGRA = + """ + Could not generate an intermediate CAGRA graph because the initial \ + kNN graph contains too many invalid or duplicated neighbor nodes. 
\ + This error can occur, for example, if too many overflows occur \ + during the norm computation between the dataset vectors\ + """; + + static void handleThrowableWithIgnore(Throwable t, String msg) throws IOException { + if (t.getMessage().contains(msg)) { + return; } - return total; + handleThrowable(t); } - /** OutputStream for writing into an IndexOutput */ - public class SegmentOutputStream extends OutputStream { + static void handleThrowable(Throwable t) throws IOException { + switch (t) { + case IOException ioe -> throw ioe; + case Error error -> throw error; + case RuntimeException re -> throw re; + case null, default -> throw new RuntimeException("UNEXPECTED: exception type", t); + } + } - IndexOutput out; - int bufferSize; - byte[] buffer; - int p; + private static DocsWithFieldSet getVectorData(FloatVectorValues floatVectorValues, float[][] dst) + throws IOException { + DocsWithFieldSet docsWithField = new DocsWithFieldSet(); + int count = 0; + KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator(); + for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) { + assert iter.index() == count; + dst[iter.index()] = floatVectorValues.vectorValue(iter.index()); + docsWithField.add(docV); + count++; + } + return docsWithField; + } - public SegmentOutputStream(IndexOutput out, int bufferSize) throws IOException { - super(); - this.out = out; - this.bufferSize = bufferSize; - this.buffer = new byte[this.bufferSize]; + @Override + public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { + flatVectorsWriter.mergeOneField(fieldInfo, mergeState); + try { + final FloatVectorValues mergedVectorValues = + switch (fieldInfo.getVectorEncoding()) { + case BYTE -> throw new AssertionError("bytes not supported"); + case FLOAT32 -> + KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); + }; + + float[][] vectors = new float[mergedVectorValues.size()][mergedVectorValues.dimension()]; + getVectorData(mergedVectorValues, vectors); + writeFieldInternal(fieldInfo, vectors); + } catch (Throwable t) { + handleThrowable(t); } + } - @Override - public void write(int b) throws IOException { - buffer[p] = (byte) b; - p += 1; - if (p == bufferSize) { - flush(); - } + @Override + public void finish() throws IOException { + if (finished) { + throw new IllegalStateException("already finished"); } + finished = true; + flatVectorsWriter.finish(); - @Override - public void flush() throws IOException { - out.writeBytes(buffer, p); - p = 0; + if (meta != null) { + // write end of fields marker + meta.writeInt(-1); + CodecUtil.writeFooter(meta); } + if (cuvsIndex != null) { + CodecUtil.writeFooter(cuvsIndex); + } + } - @Override - public void close() throws IOException { - this.flush(); + @Override + public void close() throws IOException { + IOUtils.close(meta, cuvsIndex, flatVectorsWriter); + } + + @Override + public long ramBytesUsed() { + long total = SHALLOW_RAM_BYTES_USED; + for (var field : fields) { + total += field.ramBytesUsed(); } + return total; } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java new file mode 100644 index 000000000000..4eb8ed558f70 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.io.InputStream; +import org.apache.lucene.store.IndexInput; + +/** InputStream for reading from an IndexInput. */ +final class IndexInputInputStream extends InputStream { + + final IndexInput in; + long pos = 0; + final long limit; + + IndexInputInputStream(IndexInput in) { + this.in = in; + this.limit = in.length(); + } + + @Override + public int read() throws IOException { + if (pos >= limit) { + return -1; + } + pos++; + return in.readByte(); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + if (len <= 0) { + return 0; + } + if (pos >= limit) { + return -1; + } + long avail = limit - pos; + if (len > avail) { + len = (int) avail; + } + in.readBytes(b, off, len); + pos += len; + return len; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java new file mode 100644 index 000000000000..ffb2b922e4b5 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.io.OutputStream; +import org.apache.lucene.store.IndexOutput; + +/** OutputStream for writing into an IndexOutput */ +final class IndexOutputOutputStream extends OutputStream { + + static final int DEFAULT_BUFFER_SIZE = 8192; + + final IndexOutput out; + final int bufferSize; + final byte[] buffer; + int idx; + + IndexOutputOutputStream(IndexOutput out) { + this(out, DEFAULT_BUFFER_SIZE); + } + + IndexOutputOutputStream(IndexOutput out, int bufferSize) { + this.out = out; + this.bufferSize = bufferSize; + this.buffer = new byte[bufferSize]; + } + + @Override + public void write(int b) throws IOException { + buffer[idx] = (byte) b; + idx++; + if (idx == bufferSize) { + flush(); + } + } + + @Override + public void write(byte[] b, int offset, int length) throws IOException { + if (idx != 0) { + flush(); + } + out.writeBytes(b, offset, length); + } + + @Override + public void flush() throws IOException { + out.writeBytes(buffer, 0, idx); + idx = 0; + } + + @Override + public void close() throws IOException { + this.flush(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java deleted file mode 100644 index 8f81c8bb7f15..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.sandbox.vectorsearch; - -import java.io.IOException; -import java.io.InputStream; -import org.apache.lucene.store.IndexInput; - -/** InputStream semantics for reading from an IndexInput */ -/*package-private*/ class SegmentInputStream extends InputStream { - - /** */ - private final IndexInput indexInput; - - public final long initialFilePointerPosition; - public final long limit; - public long pos = 0; - - // TODO: This input stream needs to be modified to enable buffering. 
- public SegmentInputStream(IndexInput indexInput, long limit, long initialFilePointerPosition) - throws IOException { - super(); - this.indexInput = indexInput; - this.initialFilePointerPosition = initialFilePointerPosition; - this.limit = limit; - - this.indexInput.seek(initialFilePointerPosition); - } - - @Override - public int read() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public int read(byte[] b, int off, int len) { - try { - long avail = limit - pos; - if (pos >= limit) { - return -1; - } - if (len > avail) { - len = (int) avail; - } - if (len <= 0) { - return 0; - } - indexInput.readBytes(b, off, len); - pos += len; - return len; - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - @Override - public int read(byte[] b) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public void reset() throws IOException { - indexInput.seek(initialFilePointerPosition); - pos = 0; - } - - @Override - public long skip(long n) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public boolean markSupported() { - return true; - } - - @Override - public void mark(int readlimit) { - throw new UnsupportedOperationException(); - } - - @Override - public void close() { - // Do nothing for now. - } - - @Override - public int available() { - throw new UnsupportedOperationException(); - } -} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java deleted file mode 100644 index a46db32afea9..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SerializationUtils.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.sandbox.vectorsearch; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import java.io.OutputStream; -import java.io.Serializable; -import java.io.UncheckedIOException; -import java.util.Objects; - -/*package-private*/ class SerializationUtils { - - static byte[] serialize(final Serializable obj) { - final ByteArrayOutputStream baos = new ByteArrayOutputStream(64 * 1024); - serialize(obj, baos); - return baos.toByteArray(); - } - - static void serialize(final Serializable obj, final OutputStream outputStream) { - Objects.requireNonNull(outputStream); - try (ObjectOutputStream out = new ObjectOutputStream(outputStream)) { - out.writeObject(obj); - } catch (final IOException ex) { - throw new UncheckedIOException(ex); - } - } - - static T deserialize(final byte[] objectData) { - Objects.requireNonNull(objectData); - return deserialize(new ByteArrayInputStream(objectData)); - } - - static T deserialize(final InputStream inputStream) { - Objects.requireNonNull(inputStream); - try (ObjectInputStream in = new ObjectInputStream(inputStream)) { - @SuppressWarnings("unchecked") - final T obj = (T) in.readObject(); - return obj; - } catch (IOException ex) { - throw new UncheckedIOException(ex); - } catch (ClassNotFoundException ex) { - throw new AssertionError(ex); - } - } -} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java deleted file mode 100644 index ba980777b2df..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.sandbox.vectorsearch; - -import java.io.ByteArrayOutputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; - -/** Some Utils used in CuVS integration */ -/*package-private*/ class Util { - - public static ByteArrayOutputStream getZipEntryBAOS( - String fileName, SegmentInputStream segInputStream) throws IOException { - segInputStream.reset(); - ZipInputStream zipInputStream = new ZipInputStream(segInputStream); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - boolean fileFound = false; - ZipEntry zipEntry; - while (zipInputStream.available() == 1 - && ((zipEntry = zipInputStream.getNextEntry()) != null)) { - if (zipEntry.getName().equals(fileName)) { - fileFound = true; - byte[] buffer = new byte[1024]; - int length; - while ((length = zipInputStream.read(buffer)) != -1) { - baos.write(buffer, 0, length); - } - } - } - if (!fileFound) throw new FileNotFoundException(); - return baos; - } - - // private static final Logger log = Logger.getLogger(Util.class.getName()); - - public static ArrayList getMergedVectors( - List segInputStreams, String fieldName, String mergedSegmentName) - throws IOException { - ZipEntry zs; - ArrayList mergedVectors = new ArrayList(); - // log.info("Getting mergedVectors..."); - for (SegmentInputStream segInputStream : segInputStreams) { - segInputStream.reset(); - ZipInputStream zipStream = new ZipInputStream(segInputStream); - while ((zs = zipStream.getNextEntry()) != null) { - // log.info("Getting mergedVectors... " + zs.getName()); - byte[] buffer = new byte[1024]; - int length; - if (zs.getName().endsWith(".vec")) { - String field = zs.getName().split("\\.")[0].split("/")[1]; - if (fieldName.equals(field)) { - ByteArrayOutputStream baosM = new ByteArrayOutputStream(); - while ((length = zipStream.read(buffer)) != -1) { - baosM.write(buffer, 0, length); - } - List m = SerializationUtils.deserialize(baosM.toByteArray()); - mergedVectors.addAll(m); - } - } - } - } - return mergedVectors; - } -} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java index 57be29050441..a20a49be6f53 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java @@ -32,6 +32,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.KnnFloatVectorQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.store.Directory; @@ -128,7 +129,8 @@ public void testVectorSearch() throws IOException { log.info("Query size: " + numQueries + "x" + queries[0].length); log.info("TopK: " + topK); - Query query = new CuVSKnnFloatVectorQuery("vector", queries[0], topK, topK, 1); + // Query query = new CuVSKnnFloatVectorQuery("vector", queries[0], topK, topK, 1); + Query query = new KnnFloatVectorQuery("vector", queries[0], topK); int correct[] = new int[topK]; for (int i = 0; i < topK; i++) correct[i] = expected.get(0).get(i); diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java 
b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java index ae5b2403a3e5..96f755b1b98f 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java @@ -16,9 +16,21 @@ */ package org.apache.lucene.sandbox.vectorsearch; +import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; + import java.util.List; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; import org.junit.BeforeClass; @@ -33,10 +45,87 @@ public static void beforeClass() { @Override protected Codec getCodec() { return TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); + // For convenience, to sanitize the test code, one can comment out + // the supported check and use another format, e.g. + // return TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswVectorsFormat()); } @Override protected List supportedVectorEncodings() { return List.of(VectorEncoding.FLOAT32); } + + public void testMergeTwoSegsWithASingleDocPerSeg() throws Exception { + float[][] f = new float[][] {randomVector(384), randomVector(384)}; + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { + Document doc1 = new Document(); + doc1.add(new StringField("id", "0", Field.Store.NO)); + doc1.add(new KnnFloatVectorField("f", f[0], EUCLIDEAN)); + w.addDocument(doc1); + w.commit(); + Document doc2 = new Document(); + doc2.add(new StringField("id", "1", Field.Store.NO)); + doc2.add(new KnnFloatVectorField("f", f[1], EUCLIDEAN)); + w.addDocument(doc2); + w.flush(); + w.commit(); + + // sanity - verify one doc per leaf + try (DirectoryReader reader = DirectoryReader.open(w)) { + List subReaders = reader.leaves(); + assertEquals(2, subReaders.size()); + assertEquals(1, subReaders.get(0).reader().getFloatVectorValues("f").size()); + assertEquals(1, subReaders.get(1).reader().getFloatVectorValues("f").size()); + } + + // now merge to a single segment + w.forceMerge(1); + + // verify merged content + try (DirectoryReader reader = DirectoryReader.open(w)) { + LeafReader r = getOnlyLeafReader(reader); + FloatVectorValues values = r.getFloatVectorValues("f"); + assertNotNull(values); + assertEquals(2, values.size()); + assertArrayEquals(f[0], values.vectorValue(0), 0.0f); + assertArrayEquals(f[1], values.vectorValue(1), 0.0f); + } + } + } + + // Basic test for multiple vectors fields per document + public void testTwoVectorFieldsPerDoc() throws Exception { + float[][] f1 = new float[][] {randomVector(384), randomVector(384)}; + float[][] f2 = new float[][] {randomVector(384), randomVector(384)}; + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { + Document doc1 = new Document(); + doc1.add(new StringField("id", "0", Field.Store.NO)); + doc1.add(new KnnFloatVectorField("f1", 
f1[0], EUCLIDEAN)); + doc1.add(new KnnFloatVectorField("f2", f2[0], EUCLIDEAN)); + w.addDocument(doc1); + Document doc2 = new Document(); + doc2.add(new StringField("id", "1", Field.Store.NO)); + doc2.add(new KnnFloatVectorField("f1", f1[1], EUCLIDEAN)); + doc2.add(new KnnFloatVectorField("f2", f2[1], EUCLIDEAN)); + w.addDocument(doc2); + w.forceMerge(1); + + try (DirectoryReader reader = DirectoryReader.open(w)) { + LeafReader r = getOnlyLeafReader(reader); + FloatVectorValues values = r.getFloatVectorValues("f1"); + assertNotNull(values); + assertEquals(2, values.size()); + assertArrayEquals(f1[0], values.vectorValue(0), 0.0f); + assertArrayEquals(f1[1], values.vectorValue(1), 0.0f); + + values = r.getFloatVectorValues("f2"); + assertNotNull(values); + assertEquals(2, values.size()); + assertArrayEquals(f2[0], values.vectorValue(0), 0.0f); + assertArrayEquals(f2[1], values.vectorValue(1), 0.0f); + } + } + } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java new file mode 100644 index 000000000000..e2e2b7600e9d --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import static org.apache.lucene.util.ArrayUtil.copyOfSubArray; + +import java.io.IOException; +import java.util.Random; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestIndexOutputOutputStream extends LuceneTestCase { + + public void testBasic() throws IOException { + try (var dir = newDirectory()) { + try (var indexOut = dir.createOutput("test", IOContext.DEFAULT)) { + var out = new IndexOutputOutputStream(indexOut); + out.write(0x56); + out.write(new byte[] {0x10, 0x11, 0x12, 0x13, 0x14}); + out.close(); + } + + try (var indexIn = dir.openInput("test", IOContext.DEFAULT)) { + var in = new IndexInputInputStream(indexIn); + // assertEquals(0x56, in.read()); + byte[] ba = new byte[6]; + assertEquals(6, in.read(ba)); + assertArrayEquals(new byte[] {0x56, 0x10, 0x11, 0x12, 0x13, 0x14}, ba); + } + } + } + + public void testGetFilePointer() throws IOException { + try (var dir = newDirectory()) { + try (var indexOut = dir.createOutput("test", IOContext.DEFAULT)) { + var out = new IndexOutputOutputStream(indexOut); + out.write(0x56); + out.write(new byte[] {0x10, 0x11, 0x12}); + assertEquals(4, indexOut.getFilePointer()); + out.close(); + } + } + } + + public void testWithRandom() throws IOException { + byte[] data = new byte[Math.min(atLeast(10_000), 20_000)]; + Random random = random(); + random.nextBytes(data); + + try (var dir = newDirectory()) { + try (var indexOut = dir.createOutput("test", IOContext.DEFAULT)) { + var out = new IndexOutputOutputStream(indexOut); + int i = 0; + while (i < data.length) { + if (random.nextBoolean()) { + out.write(data[i]); + i++; + } else { + int numBytes = random.nextInt(Math.min(data.length - i, 100)); + out.write(data, i, numBytes); + i += numBytes; + } + } + out.close(); + } + + try (var indexIn = dir.openInput("test", IOContext.DEFAULT)) { + var in = new IndexInputInputStream(indexIn); + int i = 0; + while (i < data.length) { + if (random.nextBoolean()) { + int b = in.read(); + assertEquals(data[i], b); + i++; + } else { + int numBytes = random.nextInt(Math.min(data.length - i, 100)); + byte[] ba = new byte[numBytes]; + in.read(ba, 0, numBytes); + assertArrayEquals(copyOfSubArray(data, i, i + numBytes), ba); + i += numBytes; + } + } + assertEquals(-1, in.read()); + assertEquals(-1, in.read(new byte[2])); + } + } + } +} From 30206d68d78ed603594071ff3314efc3e75f4290 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 14 Feb 2025 11:35:37 +0000 Subject: [PATCH 74/88] add bug URLs --- .../lucene/sandbox/vectorsearch/CuVSVectorsFormat.java | 4 ---- .../lucene/sandbox/vectorsearch/CuVSVectorsReader.java | 3 ++- .../lucene/sandbox/vectorsearch/CuVSVectorsWriter.java | 10 +++++++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index 0e839bafe792..cf5ade94d679 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -46,10 +46,6 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; public static final int DEFAULT_GRAPH_DEGREE = 64; - // The minimum number of vectors in the dataset required before - // we attempt to build a Cagra index - static 
final int MIN_CAGRA_INDEX_SIZE = 2; - static CuVSResources resources = cuVSResourcesOrNull(); /** The format for storing, reading, and merging raw vectors on disk. */ diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index 07b44854f7c2..e044ede4d8cb 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -312,7 +312,7 @@ static FloatToFloatFunction getScoreNormalizationFunc(VectorSimilarityFunction s return score -> (1f / (1f + score)); } - // This is a hack - replace with cuVS bugId/filter support + // This is a hack - https://github.com/rapidsai/cuvs/issues/696 static final int FILTER_OVER_SAMPLE = 10; @Override @@ -350,6 +350,7 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits new CagraQuery.Builder() .withTopK(topK) .withSearchParams(searchParams) + // we don't use ord to doc mapping, https://github.com/rapidsai/cuvs/issues/699 .withMapping(null) .withQueryVectors(new float[][] {target}) .build(); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 013ee0f40433..4e2df540a9c7 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -22,7 +22,6 @@ import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_EXT; import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_EXT; import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_NAME; -import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.MIN_CAGRA_INDEX_SIZE; import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_CURRENT; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance; @@ -68,6 +67,10 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { @SuppressWarnings("unused") private static final Logger log = Logger.getLogger(CuVSVectorsWriter.class.getName()); + // The minimum number of vectors in the dataset required before + // we attempt to build a Cagra index + static final int MIN_CAGRA_INDEX_SIZE = 2; + private final int cuvsWriterThreads; private final int intGraphDegree; private final int graphDegree; @@ -242,7 +245,7 @@ public void flush(int maxDoc, DocMap sortMap) throws IOException { } private void writeField(CuVSFieldWriter fieldData) throws IOException { - // TODO: Argh! + // TODO: Argh! https://github.com/rapidsai/cuvs/issues/698 float[][] vectors = fieldData.getVectors().toArray(float[][]::new); writeFieldInternal(fieldData.fieldInfo(), vectors); } @@ -254,7 +257,8 @@ private void writeSortingField(CuVSFieldWriter fieldData, Sorter.DocMap sortMap) mapOldOrdToNewOrd(oldDocsWithFieldSet, sortMap, null, new2OldOrd, null); - // TODO: Argh! we need to be able to avoid loading all vectors into contiguous heap memory + // TODO: Argh! 
https://github.com/rapidsai/cuvs/issues/698 + // Also will be replaced with the cuVS merge api float[][] oldVectors = fieldData.getVectors().toArray(float[][]::new); float[][] newVectors = new float[oldVectors.length][]; for (int i = 0; i < oldVectors.length; i++) { From 8e9fe16e4776d28a275718bb846ec453716d448b Mon Sep 17 00:00:00 2001 From: Vivek Narang Date: Fri, 21 Feb 2025 16:20:28 -0500 Subject: [PATCH 75/88] Make CuVSKnnFloatVectorQuery public --- .../lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java index efa4ce51e77a..2f6c636590ef 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -25,7 +25,7 @@ import org.apache.lucene.util.Bits; /** Query for CuVS */ -/*package-private*/ class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { +public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { private final int iTopK; private final int searchWidth; From 34afa24001efe8bc272e82195cfeb009de57fcdc Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Fri, 21 Feb 2025 14:59:43 +0000 Subject: [PATCH 76/88] assertion and test --- .../lucene/sandbox/vectorsearch/CuVSVectorsReader.java | 1 + .../lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index e044ede4d8cb..97c12798e6fb 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -336,6 +336,7 @@ public void search(String field, float[] target, KnnCollector knnCollector, Bits collectorTopK = knnCollector.k() * FILTER_OVER_SAMPLE; } final int topK = Math.min(collectorTopK, fieldEntry.count()); + assert topK > 0 : "Expected topK > 0, got:" + topK; Map result; if (knnCollector.k() <= 1024 && cuvsIndex.getCagraIndex() != null) { diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java index 96f755b1b98f..dbbdecf82ec9 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java @@ -125,6 +125,11 @@ public void testTwoVectorFieldsPerDoc() throws Exception { assertEquals(2, values.size()); assertArrayEquals(f2[0], values.vectorValue(0), 0.0f); assertArrayEquals(f2[1], values.vectorValue(1), 0.0f); + + // opportunistically check boundary condition - search with a 0 topK + var topDocs = r.searchNearestVectors("f1", randomVector(384), 0, null, 10); + assertEquals(0, topDocs.scoreDocs.length); + assertEquals(0, topDocs.totalHits.value()); } } } From 8ae25155e4ba940b6f8435a1429e0cf54c8b9e8e Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sat, 22 Feb 2025 20:46:24 +0000 Subject: [PATCH 77/88] plumb infoStream, and add indexType --- .../sandbox/vectorsearch/CuVSCodec.java | 5 +- 
.../vectorsearch/CuVSVectorsFormat.java | 15 +- .../vectorsearch/CuVSVectorsWriter.java | 140 ++++++++++++------ 3 files changed, 109 insertions(+), 51 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java index f455a863a9a1..3489221908f8 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -22,6 +22,7 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.IndexType; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; /** CuVS based codec for GPU based vector search */ @@ -35,7 +36,9 @@ public CuVSCodec(String name, Codec delegate) { super(name, delegate); KnnVectorsFormat format; try { - format = new CuVSVectorsFormat(1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE); + format = + new CuVSVectorsFormat( + 1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE, IndexType.CAGRA_AND_BRUTE_FORCE); setKnnFormat(format); } catch (LibraryException ex) { Logger log = Logger.getLogger(CuVSCodec.class.getName()); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index cf5ade94d679..705929fd86fe 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -26,6 +26,7 @@ import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.IndexType; import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; /** CuVS based KnnVectorsFormat for GPU acceleration */ @@ -45,6 +46,8 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { public static final int DEFAULT_WRITER_THREADS = 1; public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; public static final int DEFAULT_GRAPH_DEGREE = 64; + public static final MergeStrategy DEFAULT_MERGE_STRATEGY = MergeStrategy.NON_TRIVIAL_MERGE; + public static final IndexType DEFAULT_INDEX_TYPE = IndexType.CAGRA; static CuVSResources resources = cuVSResourcesOrNull(); @@ -57,23 +60,30 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { final int intGraphDegree; final int graphDegree; final MergeStrategy mergeStrategy; + final CuVSVectorsWriter.IndexType indexType; // the index type to build, when writing public CuVSVectorsFormat() { this( DEFAULT_WRITER_THREADS, DEFAULT_INTERMEDIATE_GRAPH_DEGREE, DEFAULT_GRAPH_DEGREE, - MergeStrategy.NON_TRIVIAL_MERGE); + DEFAULT_MERGE_STRATEGY, + DEFAULT_INDEX_TYPE); } public CuVSVectorsFormat( - int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy) + int cuvsWriterThreads, + int intGraphDegree, + int graphDegree, + MergeStrategy mergeStrategy, + IndexType indexType) throws LibraryException { super("CuVSVectorsFormat"); this.mergeStrategy = mergeStrategy; this.cuvsWriterThreads = cuvsWriterThreads; this.intGraphDegree = intGraphDegree; this.graphDegree = graphDegree; + this.indexType = indexType; } private 
static CuVSResources cuVSResourcesOrNull() { @@ -112,6 +122,7 @@ public CuVSVectorsWriter fieldsWriter(SegmentWriteState state) throws IOExceptio intGraphDegree, graphDegree, mergeStrategy, + indexType, resources, flatWriter); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 4e2df540a9c7..e7670484ed14 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -58,6 +58,7 @@ import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.InfoStream; /** KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU */ public class CuVSVectorsWriter extends KnnVectorsWriter { @@ -67,6 +68,9 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { @SuppressWarnings("unused") private static final Logger log = Logger.getLogger(CuVSVectorsWriter.class.getName()); + /** The name of the CUVS component for the info-stream * */ + public static final String CUVS_COMPONENT = "CUVS"; + // The minimum number of vectors in the dataset required before // we attempt to build a Cagra index static final int MIN_CAGRA_INDEX_SIZE = 2; @@ -75,21 +79,54 @@ public class CuVSVectorsWriter extends KnnVectorsWriter { private final int intGraphDegree; private final int graphDegree; + private final CuVSResources resources; + private final IndexType indexType; + @SuppressWarnings("unused") private final MergeStrategy mergeStrategy; - private final CuVSResources resources; - private final FlatVectorsWriter flatVectorsWriter; // for writing the raw vectors private final List fields = new ArrayList<>(); private final IndexOutput meta, cuvsIndex; + private final InfoStream infoStream; private boolean finished; /** Merge strategy used for CuVS */ public enum MergeStrategy { TRIVIAL_MERGE, NON_TRIVIAL_MERGE - }; + } + + /** The CuVS index Type. */ + public enum IndexType { + /** Builds a Cagra index. */ + CAGRA(true, false, false), + /** Builds a Brute Force index. */ + BRUTE_FORCE(false, true, false), + /** Builds an HSNW index - suitable for searching on CPU. */ + HNSW(false, false, true), + /** Builds a Cagra and a Brute Force index. 
*/ + CAGRA_AND_BRUTE_FORCE(true, true, false); + private final boolean cagra, bruteForce, hnsw; + + IndexType(boolean cagra, boolean bruteForce, boolean hnsw) { + this.cagra = cagra; + this.bruteForce = bruteForce; + this.hnsw = hnsw; + } + + public boolean cagra() { + return cagra; + } + + public boolean bruteForce() { + return bruteForce; + } + + public boolean hnsw() { + return hnsw; + } + } public CuVSVectorsWriter( SegmentWriteState state, @@ -97,16 +134,19 @@ public CuVSVectorsWriter( int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy, + IndexType indexType, CuVSResources resources, FlatVectorsWriter flatVectorsWriter) throws IOException { super(); this.mergeStrategy = mergeStrategy; + this.indexType = indexType; this.cuvsWriterThreads = cuvsWriterThreads; this.intGraphDegree = intGraphDegree; this.graphDegree = graphDegree; this.resources = resources; this.flatVectorsWriter = flatVectorsWriter; + this.infoStream = state.infoStream; String metaFileName = IndexFileNames.segmentFileName( @@ -183,36 +223,36 @@ static long nanosToMillis(long nanos) { return Duration.ofNanos(nanos).toMillis(); } + private void info(String msg) { + if (infoStream.isEnabled(CUVS_COMPONENT)) { + infoStream.message(CUVS_COMPONENT, msg); + } + } + private void writeCagraIndex(OutputStream os, float[][] vectors) throws Throwable { if (vectors.length < 2) { throw new IllegalArgumentException(vectors.length + " vectors, less than min [2] required"); } - CagraIndexParams indexParams = cagraIndexParams(vectors.length); - // long startTime = System.nanoTime(); + CagraIndexParams params = cagraIndexParams(vectors.length); + long startTime = System.nanoTime(); var index = - CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build(); - // long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); - // log.info("Cagra index created: " + elapsedMillis + "ms, documents: " + vectors.length); - + CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(params).build(); + long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + info("Cagra index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); Path tmpFile = Files.createTempFile(resources.tempDirectory(), "tmpindex", "cag"); index.serialize(os, tmpFile); } private void writeBruteForceIndex(OutputStream os, float[][] vectors) throws Throwable { - BruteForceIndexParams indexParams = + BruteForceIndexParams params = new BruteForceIndexParams.Builder() .withNumWriterThreads(32) // TODO: Make this configurable later. 
.build(); - - // long startTime = System.nanoTime(); - BruteForceIndex index = - BruteForceIndex.newBuilder(resources) - .withIndexParams(indexParams) - .withDataset(vectors) - .build(); - // long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); - // log.info("BruteForce index created: " + elapsedMillis + "ms, documents: " + vectors.length); - + long startTime = System.nanoTime(); + var index = + BruteForceIndex.newBuilder(resources).withIndexParams(params).withDataset(vectors).build(); + long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + info("bf index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); index.serialize(os); } @@ -221,13 +261,11 @@ private void writeHNSWIndex(OutputStream os, float[][] vectors) throws Throwable throw new IllegalArgumentException(vectors.length + " vectors, less than min [2] required"); } CagraIndexParams indexParams = cagraIndexParams(vectors.length); - - // long startTime = System.nanoTime(); + long startTime = System.nanoTime(); var index = CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build(); - // long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); - // log.info("HNSW index created: " + elapsedMillis + "ms, documents: " + vectors.length); - + long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); + info("HNSW index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); Path tmpFile = Files.createTempFile("tmpindex", "hnsw"); index.serializeToHNSW(os, tmpFile); } @@ -268,40 +306,46 @@ private void writeSortingField(CuVSFieldWriter fieldData, Sorter.DocMap sortMap) } private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws IOException { - long cagraIndexOffset, cagraIndexLength; - long bruteForceIndexOffset, bruteForceIndexLength; - long hnswIndexOffset, hnswIndexLength; + long cagraIndexOffset, cagraIndexLength = 0L; + long bruteForceIndexOffset, bruteForceIndexLength = 0L; + long hnswIndexOffset, hnswIndexLength = 0L; assert vectors.length > 0; try { - // write the cagra graph - var cagraIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); cagraIndexOffset = cuvsIndex.getFilePointer(); - if (vectors.length > MIN_CAGRA_INDEX_SIZE) { - try { - writeCagraIndex(cagraIndexOutputStream, vectors); - } catch (Throwable t) { - handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); + if (indexType.cagra()) { + if (vectors.length > MIN_CAGRA_INDEX_SIZE) { + try { + var cagraIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + writeCagraIndex(cagraIndexOutputStream, vectors); + } catch (Throwable t) { + handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); + } + } else { + // well, no index will be written at all + assert indexType.bruteForce || indexType.hnsw(); } + cagraIndexLength = cuvsIndex.getFilePointer() - cagraIndexOffset; } - cagraIndexLength = cuvsIndex.getFilePointer() - cagraIndexOffset; - // write the brute force index - var bruteForceIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); bruteForceIndexOffset = cuvsIndex.getFilePointer(); - writeBruteForceIndex(bruteForceIndexOutputStream, vectors); - bruteForceIndexLength = cuvsIndex.getFilePointer() - bruteForceIndexOffset; + if (indexType.bruteForce()) { + var bruteForceIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + writeBruteForceIndex(bruteForceIndexOutputStream, vectors); + bruteForceIndexLength = cuvsIndex.getFilePointer() - bruteForceIndexOffset; + } - // write the hnsw index - var 
hnswIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); hnswIndexOffset = cuvsIndex.getFilePointer(); - if (vectors.length > MIN_CAGRA_INDEX_SIZE) { - try { - writeHNSWIndex(hnswIndexOutputStream, vectors); - } catch (Throwable t) { - handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); + if (indexType.hnsw()) { + var hnswIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + if (vectors.length > MIN_CAGRA_INDEX_SIZE) { + try { + writeHNSWIndex(hnswIndexOutputStream, vectors); + } catch (Throwable t) { + handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); + } } + hnswIndexLength = cuvsIndex.getFilePointer() - hnswIndexOffset; } - hnswIndexLength = cuvsIndex.getFilePointer() - hnswIndexOffset; // StringBuilder sb = new StringBuilder("writeField "); // sb.append(": fieldInfo.name=").append(fieldInfo.name); From e04c2e7f5aa3b0015f4055e2963563dbdbe0573c Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Sat, 22 Feb 2025 20:59:05 +0000 Subject: [PATCH 78/88] fix default index TYPE --- .../java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java index 3489221908f8..ac94fffaf504 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -38,7 +38,7 @@ public CuVSCodec(String name, Codec delegate) { try { format = new CuVSVectorsFormat( - 1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE, IndexType.CAGRA_AND_BRUTE_FORCE); + 1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE, IndexType.CAGRA); setKnnFormat(format); } catch (LibraryException ex) { Logger log = Logger.getLogger(CuVSCodec.class.getName()); From fbb04070f8e9c4bd9a89f292f58b041cb657b104 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 09:22:53 +0000 Subject: [PATCH 79/88] fix workaround for tiny Cagra index --- .../vectorsearch/CuVSVectorsWriter.java | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index e7670484ed14..a9d1bb4d8dda 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -310,19 +310,21 @@ private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws I long bruteForceIndexOffset, bruteForceIndexLength = 0L; long hnswIndexOffset, hnswIndexLength = 0L; assert vectors.length > 0; + + // workaround for the minimum number of vectors for Cagra + final IndexType indexType = + this.indexType.cagra() && vectors.length < MIN_CAGRA_INDEX_SIZE + ? 
IndexType.BRUTE_FORCE + : this.indexType; + try { cagraIndexOffset = cuvsIndex.getFilePointer(); if (indexType.cagra()) { - if (vectors.length > MIN_CAGRA_INDEX_SIZE) { - try { - var cagraIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); - writeCagraIndex(cagraIndexOutputStream, vectors); - } catch (Throwable t) { - handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); - } - } else { - // well, no index will be written at all - assert indexType.bruteForce || indexType.hnsw(); + try { + var cagraIndexOutputStream = new IndexOutputOutputStream(cuvsIndex); + writeCagraIndex(cagraIndexOutputStream, vectors); + } catch (Throwable t) { + handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); } cagraIndexLength = cuvsIndex.getFilePointer() - cagraIndexOffset; } From 7f39c0c612210fb747f1cb9502e1505861c9d09d Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 09:23:09 +0000 Subject: [PATCH 80/88] tidy --- .../org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java index ac94fffaf504..c3ddc809c4d3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -36,9 +36,7 @@ public CuVSCodec(String name, Codec delegate) { super(name, delegate); KnnVectorsFormat format; try { - format = - new CuVSVectorsFormat( - 1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE, IndexType.CAGRA); + format = new CuVSVectorsFormat(1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE, IndexType.CAGRA); setKnnFormat(format); } catch (LibraryException ex) { Logger log = Logger.getLogger(CuVSCodec.class.getName()); From 67ec96beffa10e04298b729d35676dd2f50ca185 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 11:42:13 +0000 Subject: [PATCH 81/88] fix bug where docs are deleted or empty --- .../vectorsearch/CuVSVectorsWriter.java | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index a9d1bb4d8dda..d374fa83ec2c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -57,6 +57,7 @@ import org.apache.lucene.index.Sorter.DocMap; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.InfoStream; @@ -306,13 +307,16 @@ private void writeSortingField(CuVSFieldWriter fieldData, Sorter.DocMap sortMap) } private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws IOException { + if (vectors.length == 0) { + writeEmpty(fieldInfo); + return; + } long cagraIndexOffset, cagraIndexLength = 0L; long bruteForceIndexOffset, bruteForceIndexLength = 0L; long hnswIndexOffset, hnswIndexLength = 0L; - assert vectors.length > 0; // workaround for the minimum number of vectors for Cagra - final IndexType indexType = + IndexType indexType = this.indexType.cagra() && vectors.length < MIN_CAGRA_INDEX_SIZE ? 
IndexType.BRUTE_FORCE : this.indexType; @@ -325,6 +329,8 @@ private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws I writeCagraIndex(cagraIndexOutputStream, vectors); } catch (Throwable t) { handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA); + // workaround for cuVS issue + indexType = IndexType.BRUTE_FORCE; } cagraIndexLength = cuvsIndex.getFilePointer() - cagraIndexOffset; } @@ -372,6 +378,10 @@ private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws I } } + private void writeEmpty(FieldInfo fieldInfo) throws IOException { + writeMeta(fieldInfo, 0, 0L, 0L, 0L, 0L, 0L, 0L); + } + private void writeMeta( FieldInfo field, int count, @@ -429,7 +439,8 @@ static void handleThrowable(Throwable t) throws IOException { } } - private static DocsWithFieldSet getVectorData(FloatVectorValues floatVectorValues, float[][] dst) + /** Copies the vector values into dst. Returns the actual number of vectors copied. */ + private static int getVectorData(FloatVectorValues floatVectorValues, float[][] dst) throws IOException { DocsWithFieldSet docsWithField = new DocsWithFieldSet(); int count = 0; @@ -440,7 +451,7 @@ private static DocsWithFieldSet getVectorData(FloatVectorValues floatVectorValue docsWithField.add(docV); count++; } - return docsWithField; + return docsWithField.cardinality(); } @Override @@ -455,7 +466,10 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE }; float[][] vectors = new float[mergedVectorValues.size()][mergedVectorValues.dimension()]; - getVectorData(mergedVectorValues, vectors); + int ret = getVectorData(mergedVectorValues, vectors); + if (ret < vectors.length) { + vectors = ArrayUtil.copyOfSubArray(vectors, 0, ret); + } writeFieldInternal(fieldInfo, vectors); } catch (Throwable t) { handleThrowable(t); From 6e86c21a26c5697035897f43afed5f0ae531eb7c Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 11:56:55 +0000 Subject: [PATCH 82/88] clamp intermediate graph degree --- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index d374fa83ec2c..f32c8cbfe05f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -208,9 +208,9 @@ private CagraIndexParams cagraIndexParams(int size) { // https://github.com/rapidsai/cuvs/issues/666 throw new IllegalArgumentException("cagra index must be greater than 2"); } - var minIntGraphDegree = Math.min(intGraphDegree, size); + var minIntGraphDegree = Math.min(intGraphDegree, size - 1); var minGraphDegree = Math.min(graphDegree, minIntGraphDegree); - // log.info(indexMsg(size, intGraphDegree, minIntGraphDegree, graphDegree, minGraphDegree)); + log.info(indexMsg(size, intGraphDegree, minIntGraphDegree, graphDegree, minGraphDegree)); return new CagraIndexParams.Builder() .withNumWriterThreads(cuvsWriterThreads) From b1a84c23105b3fb2e70e83c4ae5bd39e5015e227 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 11:58:22 +0000 Subject: [PATCH 83/88] comment out log mesg --- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index f32c8cbfe05f..90b1ec0ff6a1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -210,7 +210,7 @@ private CagraIndexParams cagraIndexParams(int size) { } var minIntGraphDegree = Math.min(intGraphDegree, size - 1); var minGraphDegree = Math.min(graphDegree, minIntGraphDegree); - log.info(indexMsg(size, intGraphDegree, minIntGraphDegree, graphDegree, minGraphDegree)); + // log.info(indexMsg(size, intGraphDegree, minIntGraphDegree, graphDegree, minGraphDegree)); return new CagraIndexParams.Builder() .withNumWriterThreads(cuvsWriterThreads) From 4dd1f88174609f91877f95c99835f18b60aba1b6 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 13:55:32 +0000 Subject: [PATCH 84/88] make 32 the default GPU index threads --- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index 705929fd86fe..7b66ec9ad528 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -43,7 +43,7 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; - public static final int DEFAULT_WRITER_THREADS = 1; + public static final int DEFAULT_WRITER_THREADS = 32; public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; public static final int DEFAULT_GRAPH_DEGREE = 64; public static final MergeStrategy DEFAULT_MERGE_STRATEGY = MergeStrategy.NON_TRIVIAL_MERGE; From c4b5c293254184d23ecd30fd7f7c42310a704031 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Tue, 25 Feb 2025 13:56:10 +0000 Subject: [PATCH 85/88] remove LibraryException from the API, so consumers don't need cuvs-java --- .../sandbox/vectorsearch/CuVSVectorsFormat.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java index 7b66ec9ad528..e0d4678aa5fe 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -62,6 +62,11 @@ public class CuVSVectorsFormat extends KnnVectorsFormat { final MergeStrategy mergeStrategy; final CuVSVectorsWriter.IndexType indexType; // the index type to build, when writing + /** + * Creates a CuVSVectorsFormat, with default values. + * + * @throws LibraryException if the native library fails to load + */ public CuVSVectorsFormat() { this( DEFAULT_WRITER_THREADS, @@ -71,13 +76,17 @@ public CuVSVectorsFormat() { DEFAULT_INDEX_TYPE); } + /** + * Creates a CuVSVectorsFormat, with the given threads, graph degree, etc. 
+ * + * @throws LibraryException if the native library fails to load + */ public CuVSVectorsFormat( int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy, - IndexType indexType) - throws LibraryException { + IndexType indexType) { super("CuVSVectorsFormat"); this.mergeStrategy = mergeStrategy; this.cuvsWriterThreads = cuvsWriterThreads; From 8cf5087379e66bb5f327d4f38746b94c7d0071d0 Mon Sep 17 00:00:00 2001 From: Vivek Narang Date: Wed, 26 Feb 2025 15:27:37 -0500 Subject: [PATCH 86/88] De-allocate indexes once serialized. --- .../apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index 90b1ec0ff6a1..d72d0bb2430a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -242,6 +242,7 @@ private void writeCagraIndex(OutputStream os, float[][] vectors) throws Throwabl info("Cagra index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); Path tmpFile = Files.createTempFile(resources.tempDirectory(), "tmpindex", "cag"); index.serialize(os, tmpFile); + index.destroyIndex(); } private void writeBruteForceIndex(OutputStream os, float[][] vectors) throws Throwable { @@ -255,6 +256,7 @@ private void writeBruteForceIndex(OutputStream os, float[][] vectors) throws Thr long elapsedMillis = nanosToMillis(System.nanoTime() - startTime); info("bf index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); index.serialize(os); + index.destroyIndex(); } private void writeHNSWIndex(OutputStream os, float[][] vectors) throws Throwable { @@ -269,6 +271,7 @@ private void writeHNSWIndex(OutputStream os, float[][] vectors) throws Throwable info("HNSW index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors"); Path tmpFile = Files.createTempFile("tmpindex", "hnsw"); index.serializeToHNSW(os, tmpFile); + index.destroyIndex(); } @Override From 3837a109bf268f9d06a70196fc4511f76e4e96d2 Mon Sep 17 00:00:00 2001 From: ChrisHegarty Date: Thu, 27 Feb 2025 12:18:32 +0000 Subject: [PATCH 87/88] de-allocate indices on the read size, when closed --- .../sandbox/vectorsearch/CuVSIndex.java | 41 ++++++++++++++++++- .../vectorsearch/CuVSVectorsReader.java | 13 +++++- .../vectorsearch/CuVSVectorsWriter.java | 10 +---- 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java index 0356d53780d1..d0cfe86d708e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -16,13 +16,17 @@ */ package org.apache.lucene.sandbox.vectorsearch; +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsReader.handleThrowable; + import com.nvidia.cuvs.BruteForceIndex; import com.nvidia.cuvs.CagraIndex; import com.nvidia.cuvs.HnswIndex; +import java.io.Closeable; +import java.io.IOException; import java.util.Objects; /** This class holds references to the actual CuVS Index (Cagra, Brute force, etc.) 
*/ -public class CuVSIndex { +public class CuVSIndex implements Closeable { private final CagraIndex cagraIndex; private final BruteForceIndex bruteforceIndex; private final HnswIndex hnswIndex; @@ -30,6 +34,7 @@ public class CuVSIndex { private int maxDocs; private String fieldName; private String segmentName; + private volatile boolean closed; public CuVSIndex( String segmentName, @@ -55,14 +60,17 @@ public CuVSIndex(CagraIndex cagraIndex, BruteForceIndex bruteforceIndex, HnswInd } public CagraIndex getCagraIndex() { + ensureOpen(); return cagraIndex; } public BruteForceIndex getBruteforceIndex() { + ensureOpen(); return bruteforceIndex; } public HnswIndex getHNSWIndex() { + ensureOpen(); return hnswIndex; } @@ -77,4 +85,35 @@ public String getSegmentName() { public int getMaxDocs() { return maxDocs; } + + private void ensureOpen() { + if (closed) { + throw new IllegalStateException("index is closed"); + } + } + + @Override + public void close() throws IOException { + if (closed) { + return; + } + closed = true; + destroyIndices(); + } + + private void destroyIndices() throws IOException { + try { + if (cagraIndex != null) { + cagraIndex.destroyIndex(); + } + if (bruteforceIndex != null) { + bruteforceIndex.destroyIndex(); + } + if (hnswIndex != null) { + hnswIndex.destroyIndex(); + } + } catch (Throwable t) { + handleThrowable(t); + } + } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java index 97c12798e6fb..cfb59121e36e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -32,9 +32,12 @@ import com.nvidia.cuvs.HnswIndex; import com.nvidia.cuvs.HnswIndexParams; import java.io.IOException; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.logging.Logger; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.hnsw.FlatVectorsReader; @@ -276,7 +279,15 @@ private CuVSIndex loadCuVSIndex(FieldEntry fieldEntry) throws IOException { @Override public void close() throws IOException { - IOUtils.close(flatVectorsReader, cuvsIndexInput); + var closeableStream = + Stream.concat( + Stream.of(flatVectorsReader, cuvsIndexInput), + stream(cuvsIndices.values().iterator()).map(cursor -> cursor.value)); + IOUtils.close(closeableStream::iterator); + } + + static Stream stream(Iterator iterator) { + return StreamSupport.stream(((Iterable) () -> iterator).spliterator(), false); } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java index d72d0bb2430a..61f77ee26e7c 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java @@ -23,6 +23,7 @@ import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_EXT; import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_NAME; import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_CURRENT; +import static 
org.apache.lucene.sandbox.vectorsearch.CuVSVectorsReader.handleThrowable; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance; @@ -433,15 +434,6 @@ static void handleThrowableWithIgnore(Throwable t, String msg) throws IOExceptio handleThrowable(t); } - static void handleThrowable(Throwable t) throws IOException { - switch (t) { - case IOException ioe -> throw ioe; - case Error error -> throw error; - case RuntimeException re -> throw re; - case null, default -> throw new RuntimeException("UNEXPECTED: exception type", t); - } - } - /** Copies the vector values into dst. Returns the actual number of vectors copied. */ private static int getVectorData(FloatVectorValues floatVectorValues, float[][] dst) throws IOException { From e4e1b15cf4992047c78ed79e074d7dfb06f762b8 Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Mon, 10 Mar 2025 20:46:52 +0530 Subject: [PATCH 88/88] Fixing scoring normalization for search spanning multiple segments --- .../lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java index 23d524cef182..caf9566064e9 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -71,7 +71,7 @@ public int k() { @Override @SuppressWarnings("cast") public boolean collect(int docId, float similarity) { - scoreDocs.add(new ScoreDoc(docId, 1f / (float) (similarity))); + scoreDocs.add(new ScoreDoc(docId, similarity)); return true; }
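
The scoring fix in the final patch above drops the per-leaf 1/similarity transform, per its subject, to keep scores comparable when a search spans several segments. As a rough, consumer-side illustration (not part of the patch set), the sketch below plugs CuVSVectorsFormat in through the usual Lucene per-field codec override, commits periodically so the index ends up with multiple segments, and runs one KnnFloatVectorQuery whose hits span those segments. The class name, field names, vector dimension, and similarity function are illustrative assumptions only, and actually running it additionally requires a CUDA-capable GPU and the cuvs-java native library.

import java.nio.file.Path;
import java.util.Random;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene101.Lucene101Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.KnnFloatVectorQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

// Hypothetical driver class, for illustration only.
public class CuVSMultiSegmentSearchSketch {

  public static void main(String[] args) throws Exception {
    Random random = new Random(42);
    int dim = 8;

    // Standard per-field codec override; the GPU vectors format applies only to "vec".
    var codec =
        new Lucene101Codec() {
          @Override
          public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
            return new CuVSVectorsFormat(); // defaults: 32 writer threads, 128/64 graph degrees
          }
        };

    try (Directory dir = FSDirectory.open(Path.of("cuvs-demo-index"))) {
      IndexWriterConfig iwc = new IndexWriterConfig().setCodec(codec);
      try (IndexWriter writer = new IndexWriter(dir, iwc)) {
        for (int i = 0; i < 100; i++) {
          float[] vector = new float[dim];
          for (int j = 0; j < dim; j++) {
            vector[j] = random.nextFloat();
          }
          Document doc = new Document();
          doc.add(new StoredField("id", i));
          doc.add(new KnnFloatVectorField("vec", vector, VectorSimilarityFunction.EUCLIDEAN));
          writer.addDocument(doc);
          if (i % 25 == 24) {
            writer.commit(); // intermediate commits typically leave several segments behind
          }
        }
      }

      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        float[] query = new float[dim]; // all-zeros query vector, purely illustrative
        TopDocs topDocs = searcher.search(new KnnFloatVectorQuery("vec", query, 10), 10);
        // Hits come from several leaves and are merged into one ranking by raw score.
        for (ScoreDoc hit : topDocs.scoreDocs) {
          System.out.println("doc=" + hit.doc + " score=" + hit.score);
        }
      }
    }
  }
}

The point the sketch exercises is that top-docs merging compares raw scores across leaves, so every per-leaf collector needs to report scores on the same scale; that appears to be the motivation for removing the reciprocal transform in PerLeafCuVSKnnCollector.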