6 changes: 5 additions & 1 deletion .gitignore
@@ -4,4 +4,8 @@
*.pkl
model_ckpt/
ckpt/
output/
output/
data/beir
models/
outputs/
SPLADE.egg-info
12 changes: 12 additions & 0 deletions .gitmodules
@@ -0,0 +1,12 @@
[submodule "two_step/two_step_pisa"]
path = two_step/two_step_pisa
url = https://github.com/carlos-lassance/two_step_pisa.git
[submodule "two_step/faster-graph-bisection"]
path = two_step/faster-graph-bisection
url = https://github.com/mpetri/faster-graph-bisection.git
[submodule "two_step/pisa-ciff"]
path = two_step/pisa-ciff
url = https://github.com/pisa-engine/ciff.git
[submodule "two_step/ciff"]
path = two_step/ciff
url = https://github.com/carlos-lassance/ciff.git
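These submodules need to be fetched before the two-step pipeline can be built (see `two_step/README.md` below); a minimal sketch, using standard git commands from an existing checkout of the repository (not part of the diff itself):

```bash
# Fetch and check out all submodules declared in .gitmodules above.
git submodule update --init --recursive
```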
13 changes: 9 additions & 4 deletions README.md
@@ -151,18 +151,18 @@ with a different batch size, corresponding lambdas for regularization might need
### Evaluating a pre-trained model

Indexing (and retrieval) can be done either using our (numba-based) implementation of inverted index,
or [Anserini](https://github.com/castorini/anserini). Let's perform these steps using an available model (`naver/splade-cocondenser-ensembledistil`).
or [Anserini](https://github.com/castorini/anserini). Let's perform these steps using an available model (`naver/splade-v3`).

```bash
conda activate splade_env
export PYTHONPATH=$PYTHONPATH:$(pwd)
export SPLADE_CONFIG_NAME="config_splade++_cocondenser_ensembledistil"
python3 -m splade.index \
init_dict.model_type_or_dir=naver/splade-cocondenser-ensembledistil \
init_dict.model_type_or_dir=naver/splade-v3 \
config.pretrained_no_yamlconfig=true \
config.index_dir=experiments/pre-trained/index
python3 -m splade.retrieve \
init_dict.model_type_or_dir=naver/splade-cocondenser-ensembledistil \
init_dict.model_type_or_dir=naver/splade-v3 \
config.pretrained_no_yamlconfig=true \
config.index_dir=experiments/pre-trained/index \
config.out_dir=experiments/pre-trained/out
@@ -175,7 +175,7 @@ You can similarly build the files that will be ingested by Anserini:

```bash
python3 -m splade.create_anserini \
init_dict.model_type_or_dir=naver/splade-cocondenser-ensembledistil \
init_dict.model_type_or_dir=naver/splade-v3 \
config.pretrained_no_yamlconfig=true \
config.index_dir=experiments/pre-trained/index \
+quantization_factor_document=100 \
@@ -207,6 +207,11 @@ done

We provide in `efficient_splade_pisa/README.md` the steps to evaluate efficient SPLADE models with PISA.

### Two-Step evaluation

We provide in `two_step/README.md` the steps to evaluate Two-Step SPLADE with PISA.


***

# Cite :scroll:
28 changes: 28 additions & 0 deletions conf/config_splade_two_step.yaml
@@ -0,0 +1,28 @@
# @package _global_

# FILES
defaults: # (these specify which config FILES to use)
############## INDEX ###################################
- index: msmarco
############## RETRIEVE ################################
- retrieve_evaluate: all
############### FLOPS ##################################
- flops: msmarco

# Direct PARAMETER setting
config:
checkpoint_dir: models/hf/v3/checkpoint
index_dir: models/hf/v3/index
out_dir: two_step/
pretrained_no_yamlconfig: true
tokenizer_type: naver/splade-v3
index_batch_size: 500
eval_batch_size: 500
matching_type: splade
max_length: 256
index_retrieve_batch_size: 400
init_dict:
model_type_or_dir: naver/splade-v3
freeze_d_model: 0
agg: max
fp16: true
28 changes: 28 additions & 0 deletions conf/config_splade_v3_hf.yaml
@@ -0,0 +1,28 @@
# @package _global_

# FILES
defaults: # (these specify which config FILES to use)
############## INDEX ###################################
- index: msmarco
############## RETRIEVE ################################
- retrieve_evaluate: all
############### FLOPS ##################################
- flops: msmarco

# Direct PARAMETER setting
config:
checkpoint_dir: models/hf/v3/checkpoint
index_dir: models/hf/v3/index
out_dir: models/hf/v3/out
pretrained_no_yamlconfig: true
tokenizer_type: naver/splade-v3
index_batch_size: 500
eval_batch_size: 500
matching_type: splade
max_length: 256
index_retrieve_batch_size: 400
init_dict:
model_type_or_dir: naver/splade-v3
freeze_d_model: 0
agg: max
fp16: true
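As a rough usage sketch (not part of the diff), the two new configs can presumably be selected the same way the README selects the pre-trained config, by exporting the config name before calling the entry points; the output locations are whatever the YAML above declares:

```bash
conda activate splade_env
export PYTHONPATH=$PYTHONPATH:$(pwd)
export SPLADE_CONFIG_NAME="config_splade_v3_hf"   # or "config_splade_two_step"
python3 -m splade.index      # index_dir from the config: models/hf/v3/index
python3 -m splade.retrieve   # out_dir from the config: models/hf/v3/out
```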
6 changes: 3 additions & 3 deletions splade/create_anserini.py
@@ -2,7 +2,7 @@
from omegaconf import DictConfig

from conf.CONFIG_CHOICE import CONFIG_NAME, CONFIG_PATH
from .datasets.dataloaders import TextCollectionDataLoader
from .datasets.dataloaders import AnseriniCollectionDataLoader
from .datasets.datasets import CollectionDatasetPreLoad
from .models.models_utils import get_model
from .tasks.transformer_evaluator import EncodeAnserini
@@ -24,14 +24,14 @@ def index(exp_dict: DictConfig):
else:
raise NotImplementedError
d_collection = CollectionDatasetPreLoad(data_dir=exp_dict["data"]["COLLECTION_PATH"], id_style="row_id")
d_loader = TextCollectionDataLoader(dataset=d_collection, tokenizer_type=model_training_config["tokenizer_type"],
d_loader = AnseriniCollectionDataLoader(dataset=d_collection, tokenizer_type=model_training_config["tokenizer_type"],
max_length=model_training_config["max_length"],
batch_size=config["index_retrieve_batch_size"],
shuffle=False, num_workers=4)
evaluator = EncodeAnserini(model, config)
evaluator.index(d_loader, quantization_factor=quantization_factor_doc)
q_collection = CollectionDatasetPreLoad(data_dir=exp_dict["data"]["Q_COLLECTION_PATH"][0], id_style="row_id")
q_loader = TextCollectionDataLoader(dataset=q_collection, tokenizer_type=model_training_config["tokenizer_type"],
q_loader = AnseriniCollectionDataLoader(dataset=q_collection, tokenizer_type=model_training_config["tokenizer_type"],
max_length=model_training_config["max_length"],
batch_size=config["index_retrieve_batch_size"],
shuffle=False, num_workers=4)
66 changes: 66 additions & 0 deletions splade/create_anserini_beir.py
@@ -0,0 +1,66 @@
import hydra
from omegaconf import DictConfig

from conf.CONFIG_CHOICE import CONFIG_NAME, CONFIG_PATH
from .datasets.dataloaders import AnseriniCollectionDataLoader
from .datasets.datasets import CollectionDatasetPreLoad
from .models.models_utils import get_model
from .tasks.transformer_evaluator import EncodeAnserini
from .utils.utils import get_initialize_config
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from .datasets.datasets import BeirDatasetAnserini
import os

@hydra.main(config_path=CONFIG_PATH, config_name=CONFIG_NAME)
def index(exp_dict: DictConfig):
exp_dict, config, init_dict, model_training_config = get_initialize_config(exp_dict)

model = get_model(config, init_dict)

if model_training_config["matching_type"] == "splade":
quantization_factor_doc = exp_dict["quantization_factor_document"]
quantization_factor_query = exp_dict["quantization_factor_query"]
elif model_training_config["matching_type"] == "splade_doc":
quantization_factor_doc = exp_dict["quantization_factor_document"]
quantization_factor_query = 1
else:
raise NotImplementedError

# Download and unzip the dataset
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(
exp_dict["beir"]["dataset"])
out_dir = exp_dict["beir"]["dataset_path"]
data_path = util.download_and_unzip(url, out_dir)

config["index_dir"] = os.path.join(config["index_dir"], "beir", exp_dict["beir"]["dataset"])
os.makedirs(config["index_dir"], exist_ok=True)

out_dir_2 = os.path.join(config["out_dir"], "beir", exp_dict["beir"]["dataset"])
config["out_dir"] = os.path.join(config["out_dir"], "beir", exp_dict["beir"]["dataset"],"docs")
os.makedirs(config["out_dir"], exist_ok=True)

corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split=exp_dict["beir"].get("split","test"))

d_collection = BeirDatasetAnserini(corpus, information_type="document")
q_collection = BeirDatasetAnserini(queries, information_type="query")


d_loader = AnseriniCollectionDataLoader(dataset=d_collection, tokenizer_type=model_training_config["tokenizer_type"],
max_length=model_training_config["max_length"],
batch_size=config["index_retrieve_batch_size"],
shuffle=False, num_workers=4)
evaluator = EncodeAnserini(model, config)
evaluator.index(d_loader, quantization_factor=quantization_factor_doc)
config["out_dir"] = out_dir_2
q_loader = AnseriniCollectionDataLoader(dataset=q_collection, tokenizer_type=model_training_config["tokenizer_type"],
max_length=model_training_config["max_length"],
batch_size=config["index_retrieve_batch_size"],
shuffle=False, num_workers=4)
evaluator = EncodeAnserini(model, config, input_type="query")
evaluator.index(q_loader, quantization_factor=quantization_factor_query)


if __name__ == "__main__":
index()
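A hedged invocation sketch for the new script (the `beir.dataset`/`beir.dataset_path` override keys and the dataset name are assumptions inferred from how `exp_dict` is read above; the quantization factors mirror the `create_anserini` example in the README):

```bash
export PYTHONPATH=$PYTHONPATH:$(pwd)
export SPLADE_CONFIG_NAME="config_splade_v3_hf"
python3 -m splade.create_anserini_beir \
    +beir.dataset=nfcorpus \
    +beir.dataset_path=data/beir \
    +quantization_factor_document=100 \
    +quantization_factor_query=100
```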
20 changes: 20 additions & 0 deletions splade/datasets/dataloaders.py
@@ -121,6 +121,26 @@ def collate_fn(self, batch):
"text": d
}

class AnseriniCollectionDataLoader(DataLoaderWrapper):
"""same but also return the input text
"""

def collate_fn(self, batch):
"""
batch is a list of tuples, each tuple has 2 (text) items (id_, doc)
"""
id_, d = zip(*batch)
processed_passage = self.tokenizer(list(d),
add_special_tokens=True,
padding="longest", # pad to max sequence length in batch
truncation="longest_first", # truncates to max model length,
max_length=self.max_length,
return_attention_mask=True)
return {**{k: torch.tensor(v) for k, v in processed_passage.items()},
"id": id_,
"text": d
}


class EvalDataLoader(DataLoaderWrapper):
"""canonical encoding (query and document concatenated)
23 changes: 23 additions & 0 deletions splade/datasets/datasets.py
@@ -121,6 +121,29 @@ def __getitem__(self, idx):
true_idx = self.idx_to_key[idx]
return idx, self.value_dictionary[true_idx]

class BeirDatasetAnserini(Dataset):
"""
dataset to iterate over a BEIR collection
we preload everything in memory at init
"""

def __init__(self, value_dictionary, information_type="document"):
assert information_type in ["document", "query"]
self.value_dictionary = value_dictionary
self.information_type = information_type
if self.information_type == "document":
self.value_dictionary = dict()
for key, value in value_dictionary.items():
self.value_dictionary[key] = value["title"] + " " + value["text"]
self.idx_to_key = {idx: key for idx, key in enumerate(self.value_dictionary)}

def __len__(self):
return len(self.value_dictionary)

def __getitem__(self, idx):
true_idx = self.idx_to_key[idx]
return true_idx, self.value_dictionary[true_idx]


class MsMarcoHardNegatives(Dataset):
"""
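For reference, a small self-contained sketch of how the new `BeirDatasetAnserini` dataset and `AnseriniCollectionDataLoader` fit together (the toy corpus and tokenizer name are just examples; the constructor arguments follow the calls in `create_anserini_beir.py` above):

```python
from splade.datasets.datasets import BeirDatasetAnserini
from splade.datasets.dataloaders import AnseriniCollectionDataLoader

# Toy corpus in BEIR's dict format: ids map to {"title": ..., "text": ...}.
corpus = {"d1": {"title": "SPLADE", "text": "sparse lexical and expansion model"},
          "d2": {"title": "PISA", "text": "performant indexes and search"}}

# For documents, title and text are concatenated; __getitem__ yields (string_id, text).
d_collection = BeirDatasetAnserini(corpus, information_type="document")

d_loader = AnseriniCollectionDataLoader(dataset=d_collection,
                                        tokenizer_type="naver/splade-v3",
                                        max_length=256,
                                        batch_size=2,
                                        shuffle=False, num_workers=0)

for batch in d_loader:
    # Tokenizer outputs as tensors, plus the untokenized ids and passages.
    print(batch["id"])                # ('d1', 'd2') -- ids stay strings
    print(batch["text"])              # ('SPLADE sparse lexical and expansion model', ...)
    print(batch["input_ids"].shape)   # padded token ids, one row per passage
```

This also appears to be why the `.item()` call is dropped in `EncodeAnserini.index` below: BEIR ids arrive as plain strings rather than integer tensors.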
2 changes: 1 addition & 1 deletion splade/tasks/transformer_evaluator.py
@@ -222,7 +222,7 @@ def index(self, collection_loader, quantization_factor=2):
inputs[k] = v.to(self.device)
batch_rep = self.model(**{self.arg_key: inputs})[self.output_key].cpu().numpy()
for rep, id_, text in zip(batch_rep, batch["id"], batch["text"]):
id_ = id_.item()
id_ = id_
idx = np.nonzero(rep)
# then extract values:
data = rep[idx]
7 changes: 7 additions & 0 deletions two_step/.gitignore
@@ -0,0 +1,7 @@
anserini_indexes/
beir/
results/
runs/
ciff_output/
pisa-canonical/
pisa-index/
44 changes: 44 additions & 0 deletions two_step/README.md
@@ -0,0 +1,44 @@
# two-step-splade

This is the code for the Two-Step SPLADE paper (https://link.springer.com/chapter/10.1007/978-3-031-56060-6_23). There are two ways of using this code: you can either replicate our results using our precomputed indexes, or reproduce them from scratch by generating the indexes yourself.

## Replicating Results

1. Download all index and query files: `wget -O two_step.tar.gz "https://www.dropbox.com/scl/fi/gjl9x8wg08bdmkic0x7xa/two_step.tar.gz?rlkey=x03evbuvnamqml6v68m5scf5x&dl=1"`
2. Untar with `tar -zxvf two_step.tar.gz`
3. Install PISA (from the submodule in this folder)
4. Run the desired table row (method) for each dataset (for example, `bash run_method_b.sh $dataset`)
5. Aggregate the results

## Reproduce

1. Install all submodules (including our PISA)
2. Download the Anserini fat JAR: `wget https://repo1.maven.org/maven2/io/anserini/anserini/0.24.1/anserini-0.24.1-fatjar.jar`
3. Generate the Anserini files with `bash run_splade.sh $dataset`
4. Split and gzip the files with `bash split.sh $dataset`
5. Count document tokens with `python token_count.py $dataset`
6. Count query tokens with `python token_count_query.py $dataset`
7. Prune with `python prune.py $dataset`
8. Index the files with `bash index.sh $dataset`
9. Test the Anserini indexes with `bash retrieve.sh $dataset`
10. Convert to PISA with `bash convert_pisa.sh $dataset`
11. Reorder the full index with `bash reorder.sh $dataset`
12. Run the desired table row (method) for each dataset (for example, `bash run_method_b.sh $dataset`)
13. Aggregate the results (steps 3–12 are combined into a single sketch below)
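Putting the generation steps together for one dataset (a sketch only; it assumes the submodules are already built and that `$dataset` is one of the dataset names the scripts expect):

```bash
dataset=nfcorpus                       # example name; use whichever dataset the scripts support
bash run_splade.sh $dataset            # 3. generate the Anserini files
bash split.sh $dataset                 # 4. split and gzip
python token_count.py $dataset         # 5. count document tokens
python token_count_query.py $dataset   # 6. count query tokens
python prune.py $dataset               # 7. prune
bash index.sh $dataset                 # 8. build the Anserini indexes
bash retrieve.sh $dataset              # 9. sanity-check the Anserini indexes
bash convert_pisa.sh $dataset          # 10. convert to PISA
bash reorder.sh $dataset               # 11. reorder the full index
bash run_method_b.sh $dataset          # 12. run one line of the results table
```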

## Issues or problems

Feel free to open a new issue or to send me an email directly (cadurosar@gmail.com).

## Cite

```
@inproceedings{lassance2024two,
title={Two-Step SPLADE: Simple, Efficient and Effective Approximation of SPLADE},
author={Lassance, Carlos and Dejean, Herv{\'e} and Clinchant, St{\'e}phane and Tonellotto, Nicola},
booktitle={European Conference on Information Retrieval},
pages={349--363},
year={2024},
organization={Springer}
}
```
1 change: 1 addition & 0 deletions two_step/ciff
Submodule ciff added at da5412