Created Python Code to set up the entire pipeline for benchmarking faiss-index with DuckDB and PGVector #1984

Open · wants to merge 29 commits into base: master

Changes from 16 commits

Commits (29)
95e7b02
added benchmarking files and instructions
Jun 14, 2024
557f560
adopted hnsw index and fixed incorrect sql queries
Jun 14, 2024
dd128a6
uncommented code
Jun 18, 2024
2206d78
Merge pull request #1 from SeanSong25/fixing_pgvector_benchmark
SeanSong25 Jun 18, 2024
22f9f85
modified the benchmark files to record performance
Jul 3, 2024
fd4db15
Merge branch 'castorini:master' into master
SeanSong25 Jul 3, 2024
6ba014d
removed unnecessary commenting characters
Jul 3, 2024
67d86ca
faiss_to_pgvector file created
Jul 22, 2024
a1e3ed0
updated git ignore to ignore venv files
songxiaoyansean Jul 22, 2024
b44f5dc
Merge branch 'performance_benchmark' of https://github.com/SeanSong25…
songxiaoyansean Jul 22, 2024
3b3fb28
Merge pull request #2 from SeanSong25/performance_benchmark
SeanSong25 Jul 31, 2024
bc0669c
refactored benchmark scripts, and added vector extraction tool
songxiaoyansean Jul 31, 2024
b077548
basic set up for benchmarking
Sep 11, 2024
4af3d03
updated doc, included instruction on running full msmarco dataset
Sep 12, 2024
82e3527
updated instruction
Sep 12, 2024
dff3006
Merge pull request #4 from SeanSong25/performance_benchmark
SeanSong25 Sep 12, 2024
744e4ec
discarded unneeded files
Sep 12, 2024
894f67b
addressed comments by reorganizing files and adding .gitkeep
Sep 14, 2024
587ccef
Merge pull request #5 from SeanSong25/performance_benchmark
SeanSong25 Sep 14, 2024
410ad2e
added git keep content
Sep 14, 2024
92d9444
Merge pull request #6 from SeanSong25/performance_benchmark
SeanSong25 Sep 14, 2024
0970e93
filename change
Sep 14, 2024
773f650
Merge pull request #7 from SeanSong25/performance_benchmark
SeanSong25 Sep 14, 2024
168b084
modified instruction file to reflect file name changes
Sep 14, 2024
36a3490
deleted unneeded files, and cleanup naming
Sep 14, 2024
3ae4b18
changed relative file paths for benchmarks
Sep 14, 2024
f51ee6a
cleaned up code
Sep 22, 2024
df29a31
Updated the instructions doc to contain set by step guide
Sep 29, 2024
f455679
added instructions on where to find the result
Sep 29, 2024
8 changes: 8 additions & 0 deletions .gitignore
@@ -8,6 +8,8 @@ collections/*
indexes/*
.vscode/
venv/
*.txt
*.duckdb

# build directories from `python3 setup.py sdist bdist_wheel`
build/
@@ -19,3 +21,9 @@ runs/

# logs should also be ignored
logs/

# binaries should also be ignored
bin/*
lib*
pyvenv*
share*
1 change: 0 additions & 1 deletion collections/.gitkeep

This file was deleted.

19 changes: 19 additions & 0 deletions duckdb_in_memory.py
@@ -0,0 +1,19 @@
import duckdb
import faiss_vector_extractor

# Open the file-based DuckDB database
file_con = duckdb.connect('my_database.duckdb')

# Create an in-memory DuckDB database
mem_con = duckdb.connect(database=':memory:')

# Extract data from the file-based msmarco table into a Pandas DataFrame
df = file_con.execute("SELECT * FROM msmarco").fetchdf()

# Register the DataFrame in the in-memory DuckDB database
mem_con.register('msmarco', df)

# Now you can create the HNSW index on the msmarco table in the in-memory database
mem_con.execute("CREATE INDEX hnsw_idx ON msmarco USING HNSW(vector) WITH (metric = 'ip')")

# Continue with your operations...
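For a quick sanity check of the new index, a top-k query can be issued against the in-memory table. The sketch below continues from the script above and mirrors the `array_inner_product` / `?::FLOAT[N]` pattern used in `vectordb_benchmark/benchmark_duckdb.py`; the 768-dimensional placeholder vector and the exact column layout of `msmarco` are assumptions, not something this PR pins down.

```python
# Minimal sketch of a top-10 inner-product search against the in-memory index.
# Assumes 768-dimensional vectors; replace the placeholder with a real query embedding.
query_vector = [0.0] * 768

top_k = mem_con.execute(
    "SELECT *, array_inner_product(vector, ?::FLOAT[768]) AS score "
    "FROM msmarco ORDER BY score DESC LIMIT 10",
    (query_vector,),
).fetchall()
print(top_k[:3])
```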
17 changes: 17 additions & 0 deletions duckdb_server.py
@@ -0,0 +1,17 @@
import duckdb
from flask import Flask, request, jsonify

app = Flask(__name__)
con = duckdb.connect('my_database.duckdb')

@app.route('/query', methods=['POST'])
def query_duckdb():
query = request.json.get('query')
try:
result = con.execute(query).fetchdf()
return result.to_json(orient='split')
except Exception as e:
return jsonify({'error': str(e)}), 400

if __name__ == '__main__':
app.run(port=5000)
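The Flask endpoint above accepts an arbitrary SQL string in the JSON body and returns the result as pandas `split`-oriented JSON. A client-side sketch using `requests` follows, assuming the server is running locally on port 5000; the SQL here is illustrative only.

```python
import requests

# Post a query to the local DuckDB server started by duckdb_server.py.
resp = requests.post(
    "http://localhost:5000/query",
    json={"query": "SELECT count(*) FROM msmarco"},
)
print(resp.status_code)  # 200 on success, 400 on a query error
print(resp.json())       # 'split'-oriented result, or {'error': ...}
```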
40 changes: 40 additions & 0 deletions instructions.md
@@ -0,0 +1,40 @@
# Encoding and Benchmarking Process

## 1. Encode the Corpus
Create a directory for document embeddings and encode the corpus using the specified encoder.

```bash
mkdir -p indexes/non-faiss-nfcorpus/documents
python -m pyserini.encode \
input --corpus collections/nfcorpus/corpus.jsonl \
--fields title text \
output --embeddings indexes/non-faiss-nfcorpus/documents \
encoder --encoder BAAI/bge-base-en-v1.5 --l2-norm \
--device cpu \
--pooling mean \
--fields title text \
--batch 32
```

## 2. Encode the Queries
Create a directory for query embeddings and encode the queries using the specified encoder.

```bash
mkdir -p indexes/non-faiss-nfcorpus/queries
python -m pyserini.encode \
input --corpus collections/nfcorpus/queries.jsonl \
--fields title text \
output --embeddings indexes/non-faiss-nfcorpus/queries \
encoder --encoder BAAI/bge-base-en-v1.5 --l2-norm \
--device cpu \
--pooling mean \
--fields title text \
--batch 32
```

## 3. Run Benchmarks

```bash
python3 benchmark_duckdb.py
python3 benchmark_pgvector.py
```
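Both benchmark scripts write TREC-format run files (e.g. `runs/.run-non-faiss-nfcorpus-result_dot_product.txt`) and report ndcg@10 through `pyserini.eval.trec_eval`. To re-score a run file afterwards, the sketch below reuses the same evaluation call that `benchmark_duckdb.py` makes; adjust the qrels and run paths if yours differ.

```python
import subprocess

# Re-evaluate a run file produced by the benchmarks (paths mirror those
# hard-coded in benchmark_duckdb.py; change them to match your setup).
subprocess.run([
    "python", "-m", "pyserini.eval.trec_eval",
    "-c", "-m", "ndcg_cut.10",
    "collections/nfcorpus/qrels/test.qrels",
    "runs/.run-non-faiss-nfcorpus-result_dot_product.txt",
])
```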
27 changes: 27 additions & 0 deletions msmarco_benchmark.py
@@ -0,0 +1,27 @@
import argparse
import faiss
import faiss_vector_extractor

TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = "/store/scratch/x59song/trec_dot_product_output.txt"

def run_benchmark(trec_output_file_path, metric, query_index_path, table_name, adaptor):
    # load_index_and_docids is assumed to be provided by the faiss_vector_extractor module
    query_vector_map = faiss_vector_extractor.load_index_and_docids(query_index_path)
    adaptor.run_benchmark(query_vector_map, table_name, metric, 20, 768, trec_output_file_path)

if __name__ == "__main__":
parser = argparse.ArgumentParser(description='FAISS Vector DB Index Constructor')
parser.add_argument('--index_name', type=str, required=True, help='name of the FAISS index file')
parser.add_argument('--metric', type=str, required=True, help='metric of the FAISS index')
parser.add_argument('--table_name', type=str, required=True, help='name of the table to store the vectors')
args = parser.parse_args()

DBConfig = {
'temp_directory': '/store/scratch/x59song/temp',
'memory_limit': '50GB'
}

    # DuckDBVectorDBFaissIndexAdaptor is assumed to be defined in faiss_vector_extractor
    adaptor = faiss_vector_extractor.DuckDBVectorDBFaissIndexAdaptor(args.index_name, DBConfig)
    adaptor.extract_vectors_and_construct_index(args.table_name, args.metric)
    run_benchmark(TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, args.metric, args.index_name, args.table_name, adaptor)


15 changes: 15 additions & 0 deletions nfcorpus_results.txt
@@ -0,0 +1,15 @@
duckdb:
building l2sq index: 0.3682847023010254 0.3575248718261719 0.35877418518066406
building cosine index: 0.4233689308166504 0.4250659942626953 0.4125690460205078
building ip index: 0.35698509216308594 0.326251745223999 0.33107995986938477
l2sq: 19.746002674102783 21.720022916793823 20.766737937927246 19.952106475830078(second run)
cosine: 22.334033727645874 22.69918704032898 22.870506525039673 22.43225622177124(second run)
ip: 20.792579174041748 19.3823139667511 20.307250261306763 20.414534091949463(second run)

pg_vector:
building l2sq index: 2.4153892993927 2.3378589153289795 2.276991844177246
building cosine index: 2.4951090812683105 2.369596004486084 2.459275960922241
building ip index: 2.471719980239868 2.325632095336914 2.4149928092956543
l2sq: 5.069890260696411 4.91141152381897 4.930738925933838 4.911103963851929(second run)
cosine: 31.49447011947632 31.42801332473755 33.082948207855225 31.616244316101074(second run)
ip: 28.120339155197144 27.629921197891235 30.123175144195557 29.147559881210327(second run)
scripts/msmarco-passage/encode_queries.py: mode changed 100644 → 100755 (no content changes)
134 changes: 134 additions & 0 deletions vectordb_benchmark/benchmark_duckdb.py
@@ -0,0 +1,134 @@
import json
import duckdb
import numpy as np
import subprocess
import time

# Paths to embedding, query, and output files
DOCUMENT_JSONL_FILE_PATH = 'indexes/non-faiss-nfcorpus/documents/embeddings.jsonl'
QUERY_JSONL_FILE_PATH = 'indexes/non-faiss-nfcorpus/queries/embeddings.jsonl'
TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_dot_product.txt'
TREC_COSINE_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_cosine.txt'
TREC_L2SQ_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_l2sq.txt'
K = 10 # Number of nearest neighbors to retrieve
RUN_ID = "DuckDBHNSW" # Identifier for the run

def get_vector_size(jsonl_file_path):
"""Determines the size of the vector, assuming vectors all have the same dimension."""
with open(jsonl_file_path, 'r') as file:
for line in file:
data = json.loads(line)
vector = data.get('vector', [])
return len(vector)
return 0

def insert_data_into_table(con, id, content, vector, table):
"""Inserts data into the DuckDB table."""
con.execute(f"INSERT INTO {table} (id, content, vector) VALUES (?, ?, ?)", (id, content, vector))

def setup_database():
"""Sets up the DuckDB database and inserts document data."""
con = duckdb.connect(database=':memory:')
con.execute("INSTALL vss")
con.execute("LOAD vss")
con.execute("PRAGMA temp_directory='/tmp/duckdb_temp'")
con.execute("PRAGMA memory_limit='4GB'")

vector_size = get_vector_size(DOCUMENT_JSONL_FILE_PATH)
print(f"Vector size: {vector_size}")

# Create documents table
con.execute(f"""
CREATE TABLE documents (
id STRING,
content STRING,
vector FLOAT[{vector_size}]
)
""")

# Insert data from JSONL file
with open(DOCUMENT_JSONL_FILE_PATH, 'r') as file:
for line in file:
data = json.loads(line)
insert_data_into_table(con, data['id'], data['contents'], data['vector'], 'documents')

# Create HNSW indices with different metrics
# print the time taken for each index building
start_time = time.time()
con.execute("CREATE INDEX l2sq_idx ON documents USING HNSW(vector) WITH (metric = 'l2sq')")
print('building l2sq index: ', time.time() - start_time)
start_time = time.time()
con.execute("CREATE INDEX cos_idx ON documents USING HNSW(vector) WITH (metric = 'cosine')")
print('building cosine index: ', time.time() - start_time)
start_time = time.time()
con.execute("CREATE INDEX ip_idx ON documents USING HNSW(vector) WITH (metric = 'ip')")
print('building ip index: ', time.time() - start_time)

return con

def run_trec_eval(trec_output_file_path):
"""Runs TREC evaluation and prints ndcg@10."""
command = [
"python", "-m", "pyserini.eval.trec_eval",
"-c", "-m", "ndcg_cut.10",
"collections/nfcorpus/qrels/test.qrels",
trec_output_file_path
]
print("ndcg@10 for ", trec_output_file_path)
subprocess.run(command)

def run_benchmark(con, trec_output_file_path, metric):
"""Runs the benchmark and writes results in TREC format."""
query_times = []
with open(trec_output_file_path, 'w') as trec_file:
with open(QUERY_JSONL_FILE_PATH, 'r') as query_file:
for line in query_file:
data = json.loads(line)
query_id = data['id']
vector = data['vector']

                # Select the scoring function for the metric. For l2sq a smaller
                # distance means a closer match, so order ascending; for cosine
                # and ip a larger score is better, so order descending.
                if metric == 'l2sq':
                    evaluation_metric = 'array_distance'
                    order_direction = 'ASC'
                elif metric == 'cosine':
                    evaluation_metric = 'array_cosine_similarity'
                    order_direction = 'DESC'
                elif metric == 'ip':
                    evaluation_metric = 'array_inner_product'
                    order_direction = 'DESC'

                sql_query = f"SELECT id, {evaluation_metric}(vector, ?::FLOAT[{len(vector)}]) AS score FROM documents ORDER BY score {order_direction} LIMIT ?"
# time the execution
start_time = time.time()
results = con.execute(sql_query, (vector, K)).fetchall()
end_time = time.time()

# Calculate the time for this query and add it to the list
query_time = end_time - start_time
query_times.append(query_time)

                # Write results in TREC format; trec_eval treats larger scores as
                # better, so report the negated distance for l2sq
                for rank, (doc_id, score) in enumerate(results, start=1):
                    if metric == 'l2sq':
                        score = -score
                    trec_file.write(f"{query_id} Q0 {doc_id} {rank} {score} {RUN_ID}\n")

print(f"TREC results written to {trec_output_file_path}")
run_trec_eval(trec_output_file_path)
# Aggregate statistics
total_time = sum(query_times)
mean_time = np.mean(query_times)
variance_time = np.var(query_times)
min_time = min(query_times)
max_time = max(query_times)
return total_time, mean_time, variance_time, min_time, max_time

if __name__ == "__main__":
con = setup_database()

# Running the benchmarks
print('l2sq: ', run_benchmark(con, TREC_L2SQ_OUTPUT_FILE_PATH, 'l2sq'))
print('cosine: ', run_benchmark(con, TREC_COSINE_OUTPUT_FILE_PATH, 'cosine'))
print('ip: ', run_benchmark(con, TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, 'ip'))

# second run
print("second run")
print('l2sq: ', run_benchmark(con, TREC_L2SQ_OUTPUT_FILE_PATH, 'l2sq'))
print('cosine: ', run_benchmark(con, TREC_COSINE_OUTPUT_FILE_PATH, 'cosine'))
print('ip: ', run_benchmark(con, TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, 'ip'))
7 changes: 7 additions & 0 deletions vectordb_benchmark/benchmark_msmarco.sh
@@ -0,0 +1,7 @@
python3 ./run_benchmark.py \
--index_name='msmarco-v1-passage.bge-base-en-v1.5' \
--table_name='msmarco' \
--metric='ip' \
--query_index_path='/store/scratch/x59song/Research/pyserini/indexes/msmarco-dev.bge-base-en-v1.5' \
--db_config_file='duckdb_db_config.txt' \
--db_type='duckdb'