"""Benchmark DuckDB's VSS (HNSW) extension on pre-encoded nfcorpus embeddings.

Loads document embeddings from a JSONL file into an in-memory DuckDB table,
builds one HNSW index per metric, then runs every query embedding against the
table and writes the top-K hits in TREC format so they can be scored with
pyserini's trec_eval wrapper.
"""

import json
import subprocess

# Paths to embedding, query, and output files.
DOCUMENT_JSONL_FILE_PATH = 'indexes/non-faiss-nfcorpus/documents/embeddings.jsonl'
QUERY_JSONL_FILE_PATH = 'indexes/non-faiss-nfcorpus/queries/embeddings.jsonl'
TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_dot_product.txt'
TREC_COSINE_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_cosine.txt'
TREC_L2SQ_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_l2sq.txt'
K = 10  # Number of nearest neighbors to retrieve.
RUN_ID = "DuckDBHNSW"  # Identifier for the run, recorded in the TREC output.

# metric name -> (DuckDB scoring function, True if larger scores are better).
# array_distance is a *distance* (smaller is better); the other two are
# similarities (larger is better).
METRICS = {
    'l2sq': ('array_distance', False),
    'cosine': ('array_cosine_similarity', True),
    'ip': ('array_inner_product', True),
}


def get_vector_size(jsonl_file_path):
    """Return the dimension of the first vector in *jsonl_file_path*.

    Assumes all vectors in the file share one dimension; returns 0 for an
    empty file.
    """
    with open(jsonl_file_path, 'r') as file:
        for line in file:
            return len(json.loads(line).get('vector', []))
    return 0


def insert_data_into_table(con, id, content, vector, table):
    """Insert one (id, content, vector) row into *table*."""
    con.execute(f"INSERT INTO {table} (id, content, vector) VALUES (?, ?, ?)",
                (id, content, vector))


def setup_database():
    """Create an in-memory DuckDB database, load documents, build HNSW indices.

    Returns the open DuckDB connection.
    """
    import duckdb  # Local import so the module can be loaded without duckdb installed.

    con = duckdb.connect(database=':memory:')
    con.execute("INSTALL vss")
    con.execute("LOAD vss")
    con.execute("PRAGMA temp_directory='/tmp/duckdb_temp'")
    con.execute("PRAGMA memory_limit='4GB'")

    vector_size = get_vector_size(DOCUMENT_JSONL_FILE_PATH)
    print(f"Vector size: {vector_size}")

    # Create documents table with a fixed-size FLOAT array column.
    con.execute(f"""
        CREATE TABLE documents (
            id STRING,
            content STRING,
            vector FLOAT[{vector_size}]
        )
    """)

    # Insert data from the JSONL file.
    with open(DOCUMENT_JSONL_FILE_PATH, 'r') as file:
        for line in file:
            data = json.loads(line)
            insert_data_into_table(con, data['id'], data['contents'],
                                   data['vector'], 'documents')

    # One HNSW index per metric so every benchmark below is index-backed.
    con.execute("CREATE INDEX l2sq_idx ON documents USING HNSW(vector) WITH (metric = 'l2sq')")
    con.execute("CREATE INDEX cos_idx ON documents USING HNSW(vector) WITH (metric = 'cosine')")
    con.execute("CREATE INDEX ip_idx ON documents USING HNSW(vector) WITH (metric = 'ip')")

    return con


def run_trec_eval(trec_output_file_path):
    """Run pyserini's trec_eval on *trec_output_file_path* and print ndcg@10."""
    command = [
        "python", "-m", "pyserini.eval.trec_eval",
        "-c", "-m", "ndcg_cut.10",
        "collections/nfcorpus/qrels/test.qrels",
        trec_output_file_path,
    ]
    print("ndcg@10 for ", trec_output_file_path)
    subprocess.run(command)


def run_benchmark(con, trec_output_file_path, metric):
    """Run every query for *metric* and write ranked results in TREC format.

    Raises ValueError for an unknown metric.  Distance metrics sort
    ascending (closest first); similarity metrics sort descending.
    """
    if metric not in METRICS:
        raise ValueError(f"Unknown metric: {metric!r}; expected one of {sorted(METRICS)}")
    scoring_function, larger_is_better = METRICS[metric]
    # BUG FIX: the original always used ORDER BY score DESC, which for
    # array_distance (l2sq) returned the K *farthest* documents.
    order = 'DESC' if larger_is_better else 'ASC'

    with open(trec_output_file_path, 'w') as trec_file, \
            open(QUERY_JSONL_FILE_PATH, 'r') as query_file:
        for line in query_file:
            data = json.loads(line)
            query_id = data['id']
            vector = data['vector']

            sql_query = (
                f"SELECT id, {scoring_function}(vector, ?::FLOAT[{len(vector)}]) as score "
                f"FROM documents ORDER BY score {order} LIMIT ?"
            )
            results = con.execute(sql_query, (vector, K)).fetchall()

            # Write results in TREC format: qid Q0 docid rank score run_id.
            for rank, (doc_id, score) in enumerate(results, start=1):
                trec_file.write(f"{query_id} Q0 {doc_id} {rank} {score} {RUN_ID}\n")

    print(f"TREC results written to {trec_output_file_path}")
    run_trec_eval(trec_output_file_path)


if __name__ == "__main__":
    con = setup_database()

    # Running the benchmarks.
    run_benchmark(con, TREC_L2SQ_OUTPUT_FILE_PATH, 'l2sq')
    run_benchmark(con, TREC_COSINE_OUTPUT_FILE_PATH, 'cosine')
    run_benchmark(con, TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, 'ip')
"""Benchmark pgvector (HNSW) on pre-encoded nfcorpus embeddings.

Mirrors benchmark_duckdb.py: loads document embeddings into a PostgreSQL
table, builds HNSW indices for L2, cosine and inner-product search, then runs
every query embedding and writes the top-K hits in TREC format so they can be
scored with pyserini's trec_eval wrapper.
"""

import json
import subprocess

# Paths to embedding, query, and output files.
DOCUMENT_JSONL_FILE_PATH = 'indexes/non-faiss-nfcorpus/documents/embeddings.jsonl'
QUERY_JSONL_FILE_PATH = 'indexes/non-faiss-nfcorpus/queries/embeddings.jsonl'
TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_dot_product.txt'
TREC_COSINE_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_cosine.txt'
TREC_L2SQ_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_l2sq.txt'
VECTOR_SIZE = 768
K = 10  # Number of nearest neighbors to retrieve.
RUN_ID = "PostgresHNSW"


def sql_for_metric(metric):
    """Return the parameterized top-K query for *metric*.

    pgvector's operators are all *distances* (<-> L2, <#> negative inner
    product, <=> cosine distance), so:
      * l2sq  : order ascending by distance, report the distance as score;
      * ip    : <#> returns the NEGATIVE inner product, so negate it and
                order descending;
      * cosine: report 1 - cosine_distance (a similarity), order descending.

    BUG FIX vs. the original: cosine previously ordered by raw cosine
    distance DESC (i.e. returned the K *farthest* documents), and every
    query bound the vector parameter twice.

    Raises ValueError for an unknown metric.
    """
    queries = {
        'l2sq': "SELECT id, vector <-> %s::vector AS score FROM documents ORDER BY score LIMIT %s",
        'ip': "SELECT id, (vector <#> %s::vector) * -1 AS score FROM documents ORDER BY score DESC LIMIT %s",
        'cosine': "SELECT id, 1 - (vector <=> %s::vector) AS score FROM documents ORDER BY score DESC LIMIT %s",
    }
    try:
        return queries[metric]
    except KeyError:
        raise ValueError(f"Unknown metric: {metric!r}; expected one of {sorted(queries)}") from None


def insert_data_into_table(cur, id, content, vector):
    """Insert one (id, content, vector) row into the documents table."""
    cur.execute("INSERT INTO documents (id, content, vector) VALUES (%s, %s, %s)",
                (id, content, vector))


def setup_database():
    """Connect to PostgreSQL, create/populate the documents table, build indices.

    Returns (cursor, connection).
    """
    import psycopg2  # Local import so the module can be loaded without the driver.

    conn = psycopg2.connect(
        dbname='main_database',
        user='mainuser',
        password='password',
        host='localhost',
        port='5432',
    )
    cur = conn.cursor()

    # Create documents table.
    cur.execute(f"""
        CREATE TABLE documents (
            id TEXT PRIMARY KEY,
            content TEXT,
            vector VECTOR({VECTOR_SIZE})
        )
    """)
    conn.commit()

    # Insert data from the JSONL file.
    with open(DOCUMENT_JSONL_FILE_PATH, 'r') as file:
        for line in file:
            data = json.loads(line)
            insert_data_into_table(cur, data['id'], data['contents'], data['vector'])
    conn.commit()

    # HNSW rather than ivfflat: same index type as the DuckDB benchmark.
    cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_l2_ops);")
    cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_cosine_ops);")
    cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_ip_ops);")
    conn.commit()

    return cur, conn


def run_trec_eval(trec_output_file_path):
    """Run pyserini's trec_eval on *trec_output_file_path* and print ndcg@10."""
    command = [
        "python", "-m", "pyserini.eval.trec_eval",
        "-c", "-m", "ndcg_cut.10",
        "collections/nfcorpus/qrels/test.qrels",
        trec_output_file_path,
    ]
    print("ndcg@10 for ", trec_output_file_path)
    subprocess.run(command)


def run_benchmark(cur, trec_output_file_path, metric):
    """Run every query for *metric* and write ranked results in TREC format."""
    sql_query = sql_for_metric(metric)

    with open(trec_output_file_path, 'w') as trec_file, \
            open(QUERY_JSONL_FILE_PATH, 'r') as query_file:
        for line in query_file:
            data = json.loads(line)
            query_id = data['id']
            vector = data['vector']

            cur.execute(sql_query, (vector, K))
            results = cur.fetchall()

            # Write results in TREC format: qid Q0 docid rank score run_id.
            for rank, (doc_id, score) in enumerate(results, start=1):
                trec_file.write(f"{query_id} Q0 {doc_id} {rank} {score} {RUN_ID}\n")

    print(f"TREC results written to {trec_output_file_path}")
    run_trec_eval(trec_output_file_path)


if __name__ == "__main__":
    cur, conn = setup_database()

    # Running the benchmarks.
    run_benchmark(cur, TREC_L2SQ_OUTPUT_FILE_PATH, 'l2sq')
    run_benchmark(cur, TREC_COSINE_OUTPUT_FILE_PATH, 'cosine')
    run_benchmark(cur, TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, 'ip')

    # Close PostgreSQL connection.
    cur.close()
    conn.close()
Run Benchmarks + +```bash +python3 benchmark_duckdb.py +python3 benchmark_pgvector.py + From 557f5609ca8ed2204d849e2be1f4c82b4dc6b37a Mon Sep 17 00:00:00 2001 From: Sean Song Date: Fri, 14 Jun 2024 16:16:05 -0400 Subject: [PATCH 02/21] adopted hnsw index and fixed incorrect sql queries --- benchmark_pg_vector.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmark_pg_vector.py b/benchmark_pg_vector.py index c981edb0c..72560861d 100644 --- a/benchmark_pg_vector.py +++ b/benchmark_pg_vector.py @@ -45,10 +45,10 @@ def setup_database(): conn.commit() # Create indexes with pgvector - cur.execute("CREATE INDEX ON documents USING ivfflat (vector vector_l2_ops);") - cur.execute("CREATE INDEX ON documents USING ivfflat (vector vector_cosine_ops);") - cur.execute("CREATE INDEX ON documents USING ivfflat (vector vector_ip_ops);") - conn.commit() + cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_l2_ops);") + cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_cosine_ops);") + cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_ip_ops);") + # conn.commit() return cur, conn @@ -74,13 +74,13 @@ def run_benchmark(cur, trec_output_file_path, metric): # Select appropriate SQL query based on the metric if metric == 'l2sq': - sql_query = "SELECT id, vector <-> %s::vector AS score FROM documents ORDER BY vector <-> %s::vector LIMIT %s" + sql_query = "SELECT id, vector <-> %s::vector AS score FROM documents ORDER BY score LIMIT %s" elif metric == 'ip': - sql_query = "SELECT id, vector <#> %s::vector AS score FROM documents ORDER BY vector <#> %s::vector LIMIT %s" + sql_query = "SELECT id, (vector <#> %s::vector) * -1 AS score FROM documents ORDER BY score DESC LIMIT %s" elif metric == 'cosine': - sql_query = "SELECT id, vector <=> %s::vector AS score FROM documents ORDER BY vector <=> %s::vector DESC LIMIT %s" + sql_query = "SELECT id, 1 - (vector <=> %s::vector) AS score FROM documents ORDER BY score 
DESC LIMIT %s" - cur.execute(sql_query, (vector, vector, K)) + cur.execute(sql_query, (vector, K)) results = cur.fetchall() # Write results in TREC format From dd128a678e57597627677af1446ad7950ddac48b Mon Sep 17 00:00:00 2001 From: Sean Song Date: Mon, 17 Jun 2024 20:39:55 -0400 Subject: [PATCH 03/21] uncommented code --- benchmark_pg_vector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark_pg_vector.py b/benchmark_pg_vector.py index 72560861d..c7d91d101 100644 --- a/benchmark_pg_vector.py +++ b/benchmark_pg_vector.py @@ -48,7 +48,7 @@ def setup_database(): cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_l2_ops);") cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_cosine_ops);") cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_ip_ops);") - # conn.commit() + conn.commit() return cur, conn From 22f9f858665278d30a06212912cc1ddc7b5690ac Mon Sep 17 00:00:00 2001 From: Sean Song Date: Wed, 3 Jul 2024 16:57:39 -0400 Subject: [PATCH 04/21] modified the benchmark files to record performance --- benchmark_duckdb.py | 27 ++++++++++++++++-- benchmark_pg_vector.py | 64 +++++++++++++++++++++++++----------------- nfcorpus_results.txt | 15 ++++++++++ 3 files changed, 78 insertions(+), 28 deletions(-) create mode 100644 nfcorpus_results.txt diff --git a/benchmark_duckdb.py b/benchmark_duckdb.py index e0505e9dc..24389e925 100644 --- a/benchmark_duckdb.py +++ b/benchmark_duckdb.py @@ -2,6 +2,7 @@ import duckdb import numpy as np import subprocess +import time # Paths to embedding, query, and output files DOCUMENT_JSONL_FILE_PATH = 'indexes/non-faiss-nfcorpus/documents/embeddings.jsonl' @@ -52,9 +53,16 @@ def setup_database(): insert_data_into_table(con, data['id'], data['contents'], data['vector'], 'documents') # Create HNSW indices with different metrics + # print the time taken for each index building + start_time = time.time() con.execute("CREATE INDEX l2sq_idx ON documents USING HNSW(vector) WITH (metric 
= 'l2sq')") + print('building l2sq index: ', time.time() - start_time) + start_time = time.time() con.execute("CREATE INDEX cos_idx ON documents USING HNSW(vector) WITH (metric = 'cosine')") + print('building cosine index: ', time.time() - start_time) + start_time = time.time() con.execute("CREATE INDEX ip_idx ON documents USING HNSW(vector) WITH (metric = 'ip')") + print('building ip index: ', time.time() - start_time) return con @@ -71,6 +79,7 @@ def run_trec_eval(trec_output_file_path): def run_benchmark(con, trec_output_file_path, metric): """Runs the benchmark and writes results in TREC format.""" + total_time = 0 with open(trec_output_file_path, 'w') as trec_file: with open(QUERY_JSONL_FILE_PATH, 'r') as query_file: for line in query_file: @@ -87,7 +96,12 @@ def run_benchmark(con, trec_output_file_path, metric): evaluation_metric = 'array_inner_product' sql_query = f"SELECT id, {evaluation_metric}(vector, ?::FLOAT[{len(vector)}]) as score FROM documents ORDER BY score DESC LIMIT ?" 
+ # time the execution + start_time = time.time() results = con.execute(sql_query, (vector, K)).fetchall() + end_time = time.time() + # aggregate time + total_time += end_time - start_time # Write results in TREC format for rank, (doc_id, score) in enumerate(results, start=1): @@ -95,11 +109,18 @@ def run_benchmark(con, trec_output_file_path, metric): print(f"TREC results written to {trec_output_file_path}") run_trec_eval(trec_output_file_path) + return total_time if __name__ == "__main__": con = setup_database() # Running the benchmarks - run_benchmark(con, TREC_L2SQ_OUTPUT_FILE_PATH, 'l2sq') - run_benchmark(con, TREC_COSINE_OUTPUT_FILE_PATH, 'cosine') - run_benchmark(con, TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, 'ip') + print('l2sq: ', run_benchmark(con, TREC_L2SQ_OUTPUT_FILE_PATH, 'l2sq')) + print('cosine: ', run_benchmark(con, TREC_COSINE_OUTPUT_FILE_PATH, 'cosine')) + print('ip: ', run_benchmark(con, TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, 'ip')) + + # second run + print("second run") + print('l2sq: ', run_benchmark(con, TREC_L2SQ_OUTPUT_FILE_PATH, 'l2sq')) + print('cosine: ', run_benchmark(con, TREC_COSINE_OUTPUT_FILE_PATH, 'cosine')) + print('ip: ', run_benchmark(con, TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, 'ip')) diff --git a/benchmark_pg_vector.py b/benchmark_pg_vector.py index c7d91d101..e8f06040b 100644 --- a/benchmark_pg_vector.py +++ b/benchmark_pg_vector.py @@ -1,6 +1,7 @@ import psycopg2 import json import subprocess +import time # Paths to embedding, query, and output files DOCUMENT_JSONL_FILE_PATH = 'indexes/non-faiss-nfcorpus/documents/embeddings.jsonl' @@ -27,28 +28,34 @@ def setup_database(): ) cur = conn.cursor() - # Create documents table - cur.execute(f""" - CREATE TABLE documents ( - id TEXT PRIMARY KEY, - content TEXT, - vector VECTOR({VECTOR_SIZE}) - ) - """) - conn.commit() - - # Insert data from JSONL file - with open(DOCUMENT_JSONL_FILE_PATH, 'r') as file: - for line in file: - data = json.loads(line) - insert_data_into_table(cur, data['id'], 
data['contents'], data['vector']) - conn.commit() - - # Create indexes with pgvector - cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_l2_ops);") - cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_cosine_ops);") - cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_ip_ops);") - conn.commit() + # # Create documents table + # cur.execute(f""" + # CREATE TABLE documents ( + # id TEXT PRIMARY KEY, + # content TEXT, + # vector VECTOR({VECTOR_SIZE}) + # ) + # """) + # conn.commit() + + # # Insert data from JSONL file + # with open(DOCUMENT_JSONL_FILE_PATH, 'r') as file: + # for line in file: + # data = json.loads(line) + # insert_data_into_table(cur, data['id'], data['contents'], data['vector']) + # conn.commit() + + # # Create indexes with pgvector + # start_time = time.time() + # cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_l2_ops);") + # print('building l2sq index: ', time.time() - start_time) + # start_time = time.time() + # cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_cosine_ops);") + # print('building cosine index: ', time.time() - start_time) + # start_time = time.time() + # cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_ip_ops);") + # print('building ip index: ', time.time() - start_time) + # conn.commit() return cur, conn @@ -64,6 +71,7 @@ def run_trec_eval(trec_output_file_path): subprocess.run(command) def run_benchmark(cur, trec_output_file_path, metric): + total_time = 0 """Runs the benchmark and writes results in TREC format.""" with open(trec_output_file_path, 'w') as trec_file: with open(QUERY_JSONL_FILE_PATH, 'r') as query_file: @@ -80,8 +88,13 @@ def run_benchmark(cur, trec_output_file_path, metric): elif metric == 'cosine': sql_query = "SELECT id, 1 - (vector <=> %s::vector) AS score FROM documents ORDER BY score DESC LIMIT %s" + # time the execution + start_time = time.time() cur.execute(sql_query, (vector, K)) results = cur.fetchall() + end_time = 
time.time() + # aggregate the time + total_time += end_time - start_time # Write results in TREC format for rank, (doc_id, score) in enumerate(results, start=1): @@ -89,14 +102,15 @@ def run_benchmark(cur, trec_output_file_path, metric): print(f"TREC results written to {trec_output_file_path}") run_trec_eval(trec_output_file_path) + return total_time if __name__ == "__main__": cur, conn = setup_database() # Running the benchmarks - run_benchmark(cur, TREC_L2SQ_OUTPUT_FILE_PATH, 'l2sq') - run_benchmark(cur, TREC_COSINE_OUTPUT_FILE_PATH, 'cosine') - run_benchmark(cur, TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, 'ip') + print('l2sq: ', run_benchmark(cur, TREC_L2SQ_OUTPUT_FILE_PATH, 'l2sq')) + print('cosine: ', run_benchmark(cur, TREC_COSINE_OUTPUT_FILE_PATH, 'cosine')) + print('ip: ', run_benchmark(cur, TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, 'ip')) # Close PostgreSQL connection cur.close() diff --git a/nfcorpus_results.txt b/nfcorpus_results.txt new file mode 100644 index 000000000..c9008f15f --- /dev/null +++ b/nfcorpus_results.txt @@ -0,0 +1,15 @@ +duckdb: +building l2sq index: 0.3682847023010254 0.3575248718261719 0.35877418518066406 +building cosine index: 0.4233689308166504 0.4250659942626953 0.4125690460205078 +building ip index: 0.35698509216308594 0.326251745223999 0.33107995986938477 +l2sq: 19.746002674102783 21.720022916793823 20.766737937927246 19.952106475830078(second run) +cosine: 22.334033727645874 22.69918704032898 22.870506525039673 22.43225622177124(second run) +ip: 20.792579174041748 19.3823139667511 20.307250261306763 20.414534091949463(second run) + +pg_vector: +building l2sq index: 2.4153892993927 2.3378589153289795 2.276991844177246 +building cosine index: 2.4951090812683105 2.369596004486084 2.459275960922241 +building ip index: 2.471719980239868 2.325632095336914 2.4149928092956543 +l2sq: 5.069890260696411 4.91141152381897 4.930738925933838 4.911103963851929(second run) +cosine: 31.49447011947632 31.42801332473755 33.082948207855225 
31.616244316101074(second run) +ip: 28.120339155197144 27.629921197891235 30.123175144195557 29.147559881210327(second run) \ No newline at end of file From 6ba014da9559a25ffc2dc4e7480c0cb8519361a8 Mon Sep 17 00:00:00 2001 From: Sean Song Date: Wed, 3 Jul 2024 17:01:38 -0400 Subject: [PATCH 05/21] removed unnecessary commenting characters --- benchmark_pg_vector.py | 58 ++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/benchmark_pg_vector.py b/benchmark_pg_vector.py index e8f06040b..1b0d64fb2 100644 --- a/benchmark_pg_vector.py +++ b/benchmark_pg_vector.py @@ -28,34 +28,36 @@ def setup_database(): ) cur = conn.cursor() - # # Create documents table - # cur.execute(f""" - # CREATE TABLE documents ( - # id TEXT PRIMARY KEY, - # content TEXT, - # vector VECTOR({VECTOR_SIZE}) - # ) - # """) - # conn.commit() - - # # Insert data from JSONL file - # with open(DOCUMENT_JSONL_FILE_PATH, 'r') as file: - # for line in file: - # data = json.loads(line) - # insert_data_into_table(cur, data['id'], data['contents'], data['vector']) - # conn.commit() - - # # Create indexes with pgvector - # start_time = time.time() - # cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_l2_ops);") - # print('building l2sq index: ', time.time() - start_time) - # start_time = time.time() - # cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_cosine_ops);") - # print('building cosine index: ', time.time() - start_time) - # start_time = time.time() - # cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_ip_ops);") - # print('building ip index: ', time.time() - start_time) - # conn.commit() + # Create documents table + cur.execute(f""" + CREATE TABLE documents ( + id TEXT PRIMARY KEY, + content TEXT, + vector VECTOR({VECTOR_SIZE}) + ) + """) + conn.commit() + + # Insert data from JSONL file + with open(DOCUMENT_JSONL_FILE_PATH, 'r') as file: + for line in file: + data = json.loads(line) + 
insert_data_into_table(cur, data['id'], data['contents'], data['vector']) + conn.commit() + + # Create indexes with pgvector + start_time = time.time() + cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_l2_ops);") + print('building l2sq index: ', time.time() - start_time) + + start_time = time.time() + cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_cosine_ops);") + print('building cosine index: ', time.time() - start_time) + + start_time = time.time() + cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_ip_ops);") + print('building ip index: ', time.time() - start_time) + conn.commit() return cur, conn From 67d86cacc105fbc3cac2bfde029e8c6b5f0ad437 Mon Sep 17 00:00:00 2001 From: Sean Song Date: Sun, 21 Jul 2024 22:00:53 -0400 Subject: [PATCH 06/21] faiss_to_pgvector file created --- faiss_to_pgvector.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 faiss_to_pgvector.py diff --git a/faiss_to_pgvector.py b/faiss_to_pgvector.py new file mode 100644 index 000000000..bc87b0ab8 --- /dev/null +++ b/faiss_to_pgvector.py @@ -0,0 +1,23 @@ +from pyserini.util import download_prebuilt_index +import faiss +import numpy as np + +# Path to the FAISS index file +index_path = '/Users/seansong/.cache/pyserini/indexes/faiss-flat.msmarco-v1-passage.bge-base-en-v1.5.20240107.b21fb6abee3be6da3b6f39c9f6d9f280/index' + +# Read the index from the file +index = faiss.read_index(index_path) + +# Retrieve the vectors from the index +# Assuming the index is flat, use index.reconstruct_n to get all vectors +num_vectors = index.ntotal # Total number of vectors in the index +dim = index.d # Dimension of the vectors + +vectors = np.zeros((num_vectors, dim), dtype='float32') +# for i in range(num_vectors): +# vectors[i] = index.reconstruct(i) +try: + index_dir = download_prebuilt_index('msmarco-v1-passage.bge-base-en-v1.5', verbose=True) +except ValueError as e: + print(str(e)) + exit(1) From 
a1e3ed0f02805e700bc3e9e766519d14ece6cfe5 Mon Sep 17 00:00:00 2001 From: x59song Date: Sun, 21 Jul 2024 22:03:18 -0400 Subject: [PATCH 07/21] updated git ignore to ignore venv files --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index b7fb0a2b4..989f63e85 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,9 @@ runs/ # logs should also be ignored logs/ + +# binaries should also be ignored +bin/* +lib* +pyvenv* +share* From bc0669c60c181fd82852d0f585853835f5729a8f Mon Sep 17 00:00:00 2001 From: x59song Date: Wed, 31 Jul 2024 14:54:11 -0400 Subject: [PATCH 08/21] refactored benchmark scripts, and added vector extraction tool --- benchmark_duckdb.py | 16 ++++++++--- benchmark_pg_vector.py | 17 +++++++++--- collections/.gitkeep | 1 - extract.sh | 56 ++++++++++++++++++++++++++++++++++++++ faiss_to_pgvector.py | 23 ---------------- faiss_vector_extractor.py | 57 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 138 insertions(+), 32 deletions(-) delete mode 100644 collections/.gitkeep create mode 100644 extract.sh delete mode 100644 faiss_to_pgvector.py create mode 100644 faiss_vector_extractor.py diff --git a/benchmark_duckdb.py b/benchmark_duckdb.py index 24389e925..acbf3d1c6 100644 --- a/benchmark_duckdb.py +++ b/benchmark_duckdb.py @@ -79,7 +79,7 @@ def run_trec_eval(trec_output_file_path): def run_benchmark(con, trec_output_file_path, metric): """Runs the benchmark and writes results in TREC format.""" - total_time = 0 + query_times = [] with open(trec_output_file_path, 'w') as trec_file: with open(QUERY_JSONL_FILE_PATH, 'r') as query_file: for line in query_file: @@ -100,8 +100,10 @@ def run_benchmark(con, trec_output_file_path, metric): start_time = time.time() results = con.execute(sql_query, (vector, K)).fetchall() end_time = time.time() - # aggregate time - total_time += end_time - start_time + + # Calculate the time for this query and add it to the list + query_time = end_time - start_time + 
query_times.append(query_time) # Write results in TREC format for rank, (doc_id, score) in enumerate(results, start=1): @@ -109,7 +111,13 @@ def run_benchmark(con, trec_output_file_path, metric): print(f"TREC results written to {trec_output_file_path}") run_trec_eval(trec_output_file_path) - return total_time + # Aggregate statistics + total_time = sum(query_times) + mean_time = np.mean(query_times) + variance_time = np.var(query_times) + min_time = min(query_times) + max_time = max(query_times) + return total_time, mean_time, variance_time, min_time, max_time if __name__ == "__main__": con = setup_database() diff --git a/benchmark_pg_vector.py b/benchmark_pg_vector.py index 1b0d64fb2..2cff06e69 100644 --- a/benchmark_pg_vector.py +++ b/benchmark_pg_vector.py @@ -1,3 +1,4 @@ +import numpy as np import psycopg2 import json import subprocess @@ -73,7 +74,7 @@ def run_trec_eval(trec_output_file_path): subprocess.run(command) def run_benchmark(cur, trec_output_file_path, metric): - total_time = 0 + query_times = [] """Runs the benchmark and writes results in TREC format.""" with open(trec_output_file_path, 'w') as trec_file: with open(QUERY_JSONL_FILE_PATH, 'r') as query_file: @@ -95,8 +96,10 @@ def run_benchmark(cur, trec_output_file_path, metric): cur.execute(sql_query, (vector, K)) results = cur.fetchall() end_time = time.time() - # aggregate the time - total_time += end_time - start_time + + # Calculate the time for this query and add it to the list + query_time = end_time - start_time + query_times.append(query_time) # Write results in TREC format for rank, (doc_id, score) in enumerate(results, start=1): @@ -104,7 +107,13 @@ def run_benchmark(cur, trec_output_file_path, metric): print(f"TREC results written to {trec_output_file_path}") run_trec_eval(trec_output_file_path) - return total_time + # Aggregate statistics + total_time = sum(query_times) + mean_time = np.mean(query_times) + variance_time = np.var(query_times) + min_time = min(query_times) + max_time = 
max(query_times) + return total_time, mean_time, variance_time, min_time, max_time if __name__ == "__main__": cur, conn = setup_database() diff --git a/collections/.gitkeep b/collections/.gitkeep deleted file mode 100644 index b1adcd339..000000000 --- a/collections/.gitkeep +++ /dev/null @@ -1 +0,0 @@ -# This is the default directory for document collections. Placeholder so that directory is kept in git. \ No newline at end of file diff --git a/extract.sh b/extract.sh new file mode 100644 index 000000000..53b14749b --- /dev/null +++ b/extract.sh @@ -0,0 +1,56 @@ +# Example usage commands to extract a batch of vectors from the FAISS index +python3 faiss_vector_extractor.py \ +--index_name='msmarco-v1-passage.bge-base-en-v1.5' \ +--output_file='msmarco-batch2.txt' \ +--batch_size=10000 \ +--start_id=1000000 \ +--num_batches=100 + +python3 faiss_vector_extractor.py \ +--index_name='msmarco-v1-passage.bge-base-en-v1.5' \ +--output_file='msmarco-batch2.txt' \ +--batch_size=10000 \ +--start_id=2000000 \ +--num_batches=100 + +python3 faiss_vector_extractor.py \ +--index_name='msmarco-v1-passage.bge-base-en-v1.5' \ +--output_file='msmarco-batch2.txt' \ +--batch_size=10000 \ +--start_id=3000000 \ +--num_batches=100 + +python3 faiss_vector_extractor.py \ +--index_name='msmarco-v1-passage.bge-base-en-v1.5' \ +--output_file='msmarco-batch2.txt' \ +--batch_size=10000 \ +--start_id=4000000 \ +--num_batches=100 + +python3 faiss_vector_extractor.py \ +--index_name='msmarco-v1-passage.bge-base-en-v1.5' \ +--output_file='msmarco-batch2.txt' \ +--batch_size=10000 \ +--start_id=5000000 \ +--num_batches=100 + +python3 faiss_vector_extractor.py \ +--index_name='msmarco-v1-passage.bge-base-en-v1.5' \ +--output_file='msmarco-batch2.txt' \ +--batch_size=10000 \ +--start_id=6000000 \ +--num_batches=100 + +python3 faiss_vector_extractor.py \ +--index_name='msmarco-v1-passage.bge-base-en-v1.5' \ +--output_file='msmarco-batch2.txt' \ +--batch_size=10000 \ +--start_id=7000000 \ 
import argparse


class FaissVectorExtractor:
    """Extract vectors from a prebuilt Pyserini FAISS index in batches.

    Writes one line per vector to the output file, formatted as
    "<faiss_internal_id>\\t<comma-separated components>".  Batching bounds
    memory use: at most *batch_size* vectors are reconstructed at a time.
    """

    def __init__(self, index_name, output_file_path, batch_size=10000, start_id=0, num_batches=1):
        # Heavy third-party imports are kept local so importing this module
        # does not require pyserini/faiss to be installed.
        from pyserini.util import download_prebuilt_index

        try:
            index_dir = download_prebuilt_index(index_name, verbose=True)
        except ValueError as e:
            # BUG FIX: the original printed the error and called exit(1)
            # inside the constructor; raise SystemExit instead so a CLI run
            # still terminates with status 1 but library callers can catch it.
            raise SystemExit(str(e))
        self.index_file_path = index_dir + '/index'
        self.output_file_path = output_file_path
        self.batch_size = batch_size
        self.index = None  # Lazily loaded by load_index().
        self.start_id = start_id
        self.num_batches = num_batches

    def load_index(self):
        """Read the FAISS index from disk; raise if it cannot be loaded."""
        import faiss

        self.index = faiss.read_index(self.index_file_path)
        if not self.index:
            raise Exception(f"Failed to load index from {self.index_file_path}")

    @staticmethod
    def format_row(doc_id, vector):
        """Format one output line: '<id>\\t<v0,v1,...>' (no trailing newline)."""
        return f"{doc_id}\t{','.join(map(str, vector))}"

    def extract_vectors(self):
        """Reconstruct up to num_batches * batch_size vectors starting at
        start_id and write their id->vector mappings to the output file."""
        if self.index is None:
            self.load_index()

        # Clamp the range so we never read past the end of the index.
        stop = min(self.index.ntotal, self.start_id + self.num_batches * self.batch_size)
        with open(self.output_file_path, "w") as f:
            for batch_start in range(self.start_id, stop, self.batch_size):
                batch_end = min(batch_start + self.batch_size, self.index.ntotal)

                # Reconstruct at most batch_size vectors at a time to keep
                # peak memory bounded.
                vectors = self.index.reconstruct_n(batch_start, batch_end - batch_start)
                for i, vector in enumerate(vectors):
                    f.write(self.format_row(batch_start + i, vector) + "\n")

        print(f"Mappings have been written to {self.output_file_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='FAISS Vector Extractor')
    parser.add_argument('--index_name', type=str, required=True, help='name of the FAISS index file')
    parser.add_argument('--output_file', type=str, required=True, help='Path to the output file for docid to vector mappings')
    parser.add_argument('--batch_size', type=int, default=10000, help='Batch size for processing vectors (default: 10000)')
    parser.add_argument('--start_id', type=int, default=0, help='Start ID for processing vectors (default: 0)')
    parser.add_argument('--num_batches', type=int, default=1, help='Number of batches to process (default: 1)')

    args = parser.parse_args()

    extractor = FaissVectorExtractor(args.index_name, args.output_file, args.batch_size, args.start_id, args.num_batches)
    extractor.extract_vectors()
2024 21:40:35 -0400 Subject: [PATCH 09/21] basic set up for benchmarking --- .gitignore | 2 + duckdb_in_memory.py | 19 +++ duckdb_server.py | 17 +++ extract.sh | 56 --------- faiss_vector_extractor.py | 57 --------- msmarco_benchmark.py | 27 ++++ scripts/msmarco-passage/encode_queries.py | 0 tools | 2 +- .../benchmark_duckdb.py | 0 vectordb_benchmark/benchmark_msmarco.sh | 6 + .../benchmark_pg_vector.py | 0 .../duckdb_faiss_index_adaptor.py | 116 ++++++++++++++++++ vectordb_benchmark/encode_msmarco.sh | 13 ++ vectordb_benchmark/evaluate_trec.py | 9 ++ vectordb_benchmark/faiss_index_adaptor.py | 60 +++++++++ vectordb_benchmark/faiss_vector_extractor.py | 63 ++++++++++ .../pgvector_faiss_index_adaptor.py | 106 ++++++++++++++++ vectordb_benchmark/run_benchmark.py | 44 +++++++ 18 files changed, 483 insertions(+), 114 deletions(-) create mode 100644 duckdb_in_memory.py create mode 100644 duckdb_server.py delete mode 100644 extract.sh delete mode 100644 faiss_vector_extractor.py create mode 100644 msmarco_benchmark.py mode change 100644 => 100755 scripts/msmarco-passage/encode_queries.py rename benchmark_duckdb.py => vectordb_benchmark/benchmark_duckdb.py (100%) create mode 100755 vectordb_benchmark/benchmark_msmarco.sh rename benchmark_pg_vector.py => vectordb_benchmark/benchmark_pg_vector.py (100%) create mode 100644 vectordb_benchmark/duckdb_faiss_index_adaptor.py create mode 100644 vectordb_benchmark/encode_msmarco.sh create mode 100644 vectordb_benchmark/evaluate_trec.py create mode 100644 vectordb_benchmark/faiss_index_adaptor.py create mode 100644 vectordb_benchmark/faiss_vector_extractor.py create mode 100644 vectordb_benchmark/pgvector_faiss_index_adaptor.py create mode 100644 vectordb_benchmark/run_benchmark.py diff --git a/.gitignore b/.gitignore index 989f63e85..b4dd91734 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ collections/* indexes/* .vscode/ venv/ +*.txt +*.duckdb # build directories from `python3 setup.py sdist bdist_wheel` build/ diff 
--git a/duckdb_in_memory.py b/duckdb_in_memory.py new file mode 100644 index 000000000..dd88cc915 --- /dev/null +++ b/duckdb_in_memory.py @@ -0,0 +1,19 @@ +import duckdb +import faiss_vector_extractor + +# Open the file-based DuckDB database +file_con = duckdb.connect('my_database.duckdb') + +# Create an in-memory DuckDB database +mem_con = duckdb.connect(database=':memory:') + +# Extract data from the file-based msmarco table into a Pandas DataFrame +df = file_con.execute("SELECT * FROM msmarco").fetchdf() + +# Register the DataFrame in the in-memory DuckDB database +mem_con.register('msmarco', df) + +# Now you can create the HNSW index on the msmarco table in the in-memory database +mem_con.execute(f"CREATE INDEX hnsw_idx ON msmarco USING HNSW(vector) WITH (metric = 'ip')") + +# Continue with your operations... diff --git a/duckdb_server.py b/duckdb_server.py new file mode 100644 index 000000000..49976f19e --- /dev/null +++ b/duckdb_server.py @@ -0,0 +1,17 @@ +import duckdb +from flask import Flask, request, jsonify + +app = Flask(__name__) +con = duckdb.connect('my_database.duckdb') + +@app.route('/query', methods=['POST']) +def query_duckdb(): + query = request.json.get('query') + try: + result = con.execute(query).fetchdf() + return result.to_json(orient='split') + except Exception as e: + return jsonify({'error': str(e)}), 400 + +if __name__ == '__main__': + app.run(port=5000) diff --git a/extract.sh b/extract.sh deleted file mode 100644 index 53b14749b..000000000 --- a/extract.sh +++ /dev/null @@ -1,56 +0,0 @@ -# Example usage commands to extract a batch of vectors from the FAISS index -python3 faiss_vector_extractor.py \ ---index_name='msmarco-v1-passage.bge-base-en-v1.5' \ ---output_file='msmarco-batch2.txt' \ ---batch_size=10000 \ ---start_id=1000000 \ ---num_batches=100 - -python3 faiss_vector_extractor.py \ ---index_name='msmarco-v1-passage.bge-base-en-v1.5' \ ---output_file='msmarco-batch2.txt' \ ---batch_size=10000 \ ---start_id=2000000 \ 
---num_batches=100 - -python3 faiss_vector_extractor.py \ ---index_name='msmarco-v1-passage.bge-base-en-v1.5' \ ---output_file='msmarco-batch2.txt' \ ---batch_size=10000 \ ---start_id=3000000 \ ---num_batches=100 - -python3 faiss_vector_extractor.py \ ---index_name='msmarco-v1-passage.bge-base-en-v1.5' \ ---output_file='msmarco-batch2.txt' \ ---batch_size=10000 \ ---start_id=4000000 \ ---num_batches=100 - -python3 faiss_vector_extractor.py \ ---index_name='msmarco-v1-passage.bge-base-en-v1.5' \ ---output_file='msmarco-batch2.txt' \ ---batch_size=10000 \ ---start_id=5000000 \ ---num_batches=100 - -python3 faiss_vector_extractor.py \ ---index_name='msmarco-v1-passage.bge-base-en-v1.5' \ ---output_file='msmarco-batch2.txt' \ ---batch_size=10000 \ ---start_id=6000000 \ ---num_batches=100 - -python3 faiss_vector_extractor.py \ ---index_name='msmarco-v1-passage.bge-base-en-v1.5' \ ---output_file='msmarco-batch2.txt' \ ---batch_size=10000 \ ---start_id=7000000 \ ---num_batches=100 - -python3 faiss_vector_extractor.py \ ---index_name='msmarco-v1-passage.bge-base-en-v1.5' \ ---output_file='msmarco-batch2.txt' \ ---batch_size=10000 \ ---start_id=8000000 \ ---num_batches=100 \ No newline at end of file diff --git a/faiss_vector_extractor.py b/faiss_vector_extractor.py deleted file mode 100644 index aa8d799e8..000000000 --- a/faiss_vector_extractor.py +++ /dev/null @@ -1,57 +0,0 @@ -from pyserini.util import download_prebuilt_index -from pyserini.search import FaissSearcher -import argparse -import numpy as np - -# Path to the FAISS index file -# index_path = '/u4/x59song/.cache/pyserini/indexes/faiss-flat.msmarco-v1-passage.bge-base-en-v1.5.20240107.b21fb6abee3be6da3b6f39c9f6d9f280/index' - -import faiss - -class FaissVectorExtractor: - def __init__(self, index_name, output_file_path, batch_size=10000, start_id=0, num_batches=1): - try: - index_dir = download_prebuilt_index(index_name, verbose=True) - except ValueError as e: - print(str(e)) - exit(1) - self.index_file_path = 
index_dir + '/index' - self.output_file_path = output_file_path - self.batch_size = batch_size - self.index = None - self.start_id = start_id - self.num_batches = num_batches - - def load_index(self): - self.index = faiss.read_index(self.index_file_path) - if not self.index: - raise Exception(f"Failed to load index from {self.index_file_path}") - - def extract_vectors(self): - if self.index is None: - self.load_index() - - with open(self.output_file_path, "w") as f: - for batch_start in range(self.start_id, min(self.index.ntotal, self.start_id + self.num_batches * self.batch_size), self.batch_size): - batch_end = min(batch_start + self.batch_size, self.index.ntotal) - - # reconstruct 1000 vectors most, so avoid memory overflow - vectors = self.index.reconstruct_n(batch_start, batch_end - batch_start) - for i, vector in enumerate(vectors): - vector_str = ",".join(map(str, vector)) - f.write(f"{batch_start + i}\t{vector_str}\n") - - print(f"Mappings have been written to {self.output_file_path}") - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='FAISS Vector Extractor') - parser.add_argument('--index_name', type=str, required=True, help='name of the FAISS index file') - parser.add_argument('--output_file', type=str, required=True, help='Path to the output file for docid to vector mappings') - parser.add_argument('--batch_size', type=int, default=10000, help='Batch size for processing vectors (default: 10000)') - parser.add_argument('--start_id', type=int, default=0, help='Start ID for processing vectors (default: 0)') - parser.add_argument('--num_batches', type=int, default=1, help='Number of batches to process (default: 1)') - - args = parser.parse_args() - - extractor = FaissVectorExtractor(args.index_name, args.output_file, args.batch_size, args.start_id, args.num_batches) - extractor.extract_vectors() diff --git a/msmarco_benchmark.py b/msmarco_benchmark.py new file mode 100644 index 000000000..2720731ca --- /dev/null +++ 
b/msmarco_benchmark.py @@ -0,0 +1,27 @@ +import argparse +import faiss +import faiss_vector_extractor + +TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = "/store/scratch/x59song/trec_dot_product_output.txt" + +def run_benchmark(trec_output_file_path, metric, query_index_path, adaptor): + query_vector_map = load_index_and_docids(query_index_path) + adaptor.run_benchmark(query_vector_map, table_name, metric, 20, 768, trec_output_file_path) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='FAISS Vector DB Index Constructor') + parser.add_argument('--index_name', type=str, required=True, help='name of the FAISS index file') + parser.add_argument('--metric', type=str, required=True, help='metric of the FAISS index') + parser.add_argument('--table_name', type=str, required=True, help='name of the table to store the vectors') + args = parser.parse_args() + + DBConfig = { + 'temp_directory': '/store/scratch/x59song/temp', + 'memory_limit': '50GB' + } + + adaptor = DuckDBVectorDBFaissIndexAdaptor(args.index_name, DBConfig) + adaptor.extract_vectors_and_construct_index(args.table_name, args.metric) + run_benchmark(TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, args.metric, args.index_name, adaptor) + + \ No newline at end of file diff --git a/scripts/msmarco-passage/encode_queries.py b/scripts/msmarco-passage/encode_queries.py old mode 100644 new mode 100755 diff --git a/tools b/tools index d4f2be22d..3a2b3cc5c 160000 --- a/tools +++ b/tools @@ -1 +1 @@ -Subproject commit d4f2be22d4a9d19ef375a700ea1068a0dc877051 +Subproject commit 3a2b3cc5cfd915d707408aa5c4567185e9e4544f diff --git a/benchmark_duckdb.py b/vectordb_benchmark/benchmark_duckdb.py similarity index 100% rename from benchmark_duckdb.py rename to vectordb_benchmark/benchmark_duckdb.py diff --git a/vectordb_benchmark/benchmark_msmarco.sh b/vectordb_benchmark/benchmark_msmarco.sh new file mode 100755 index 000000000..0abb31495 --- /dev/null +++ b/vectordb_benchmark/benchmark_msmarco.sh @@ -0,0 +1,6 @@ +python3 
vectordb_benchmark/run_benchmark.py \ +--index_name='msmarco-v1-passage.bge-base-en-v1.5' \ +--table_name='msmarco' \ +--metric='ip' \ +--query_index_path='/store/scratch/x59song/Research/pyserini/indexes/msmarco-dev.bge-base-en-v1.5' \ +--db_type='pgvector' \ diff --git a/benchmark_pg_vector.py b/vectordb_benchmark/benchmark_pg_vector.py similarity index 100% rename from benchmark_pg_vector.py rename to vectordb_benchmark/benchmark_pg_vector.py diff --git a/vectordb_benchmark/duckdb_faiss_index_adaptor.py b/vectordb_benchmark/duckdb_faiss_index_adaptor.py new file mode 100644 index 000000000..2777fd429 --- /dev/null +++ b/vectordb_benchmark/duckdb_faiss_index_adaptor.py @@ -0,0 +1,116 @@ +import faiss_index_adaptor +import duckdb +import faiss_vector_extractor +import time +import numpy as np + +class DuckDBVectorDBFaissIndexAdaptor(faiss_index_adaptor.VectorDBFaissIndexAdaptor): + def __init__(self, index_name, DBConfig): + super().__init__(index_name, DBConfig) + self.con = None + + def initialize_database_and_table(self, table_name, DBConfig, vector_size): + memory_limit = DBConfig['memory_limit'] + self.con = duckdb.connect(database=':memory:') + self.con.execute("INSTALL vss") + self.con.execute("LOAD vss") + self.con.execute(f"PRAGMA memory_limit='{memory_limit}'") + + # Create documents table + self.con.execute(f""" + CREATE TABLE {table_name} ( + id INT, + vector FLOAT[{vector_size}] + ) + """) + print(f"created table {table_name}") + + def construct_index(self, table_name, metric): + self.con.execute(f"CREATE INDEX {metric}_idx ON {table_name} USING HNSW(vector) WITH (metric = '{metric}')") + print(f"Index constructed for {table_name} using {metric} metric") + + def insert_vector_map_into_table(self, table_name, metric): + start_time = time.time() + for id, vector in self.vector_map.items(): + self.con.execute(f"INSERT INTO {table_name} (id, vector) VALUES (?, ?)", (id, vector)) + self.con.commit() + end_time = time.time() + print(f"Inserted 
{len(self.vector_map)} vectors into {table_name} in {end_time - start_time} seconds") + + def get_connection(self): + return self.con + + # close the connection + def close(self): + self.con.close() + + def run_benchmark(self, table_name, metric, K, vector_size, trec_file_path, query_vector_map=None): + print(f"running benchmark for {table_name} with metric {metric}") + # Select appropriate SQL query based on the metric + if metric == 'l2sq': + evaluation_metric = 'array_distance' + elif metric == 'cosine': + evaluation_metric = 'array_cosine_similarity' + elif metric == 'ip': + evaluation_metric = 'array_inner_product' + with open(trec_file_path, 'w') as trec_file: + count = 0 + query_times = [] + for query_id, query_vector in query_vector_map.items(): + sql_query = f"SELECT id, {evaluation_metric}(vector, ?::FLOAT[{vector_size}]) as score FROM {table_name} ORDER BY score DESC LIMIT ?" + # time the execution + start_time = time.time() + results = self.con.execute(sql_query, (query_vector, K)).fetchall() + end_time = time.time() + + # Calculate the time for this query and add it to the list + query_time = end_time - start_time + query_times.append(query_time) + + # Write results in TREC format + for rank, (doc_id, score) in enumerate(results, start=1): + trec_file.write(f"{query_id} Q0 {doc_id} {rank} {score} DuckDB\n") + count += 1 + if count % 100 == 0: + print(f"processed {count} queries") + + print(f"TREC results written to {trec_file_path}") + ans = self.run_trec_eval(trec_file_path) + # Aggregate statistics + total_time = sum(query_times) + mean_time = np.mean(query_times) + variance_time = np.var(query_times) + min_time = min(query_times) + max_time = max(query_times) + # create a file to store results + with open(f"{table_name}_benchmark_results.txt", "w") as f: + f.write(f"Total time: {total_time}\n") + f.write(f"Mean time: {mean_time}\n") + f.write(f"Variance time: {variance_time}\n") + f.write(f"Min time: {min_time}\n") + f.write(f"Max time: 
{max_time}\n") + f.write(f"TREC eval output: {ans}\n") + return total_time, mean_time, variance_time, min_time, max_time + + def create_in_memory_hnsw_index_on_file(self, file_path, table_name): + # Open the file-based DuckDB database + file_con = duckdb.connect(file_path) + + # Create an in-memory DuckDB database + self.con = duckdb.connect(database=':memory:') + self.con.execute("INSTALL vss") + self.con.execute("LOAD vss") + + # Extract data from the file-based table into a Pandas DataFrame + df = file_con.execute(f"SELECT * FROM {table_name}").fetchdf() + + df['vector'] = df['vector'].apply(lambda x: x if isinstance(x, list) else list(x)) + + # Create a new table in the in-memory DuckDB database + self.con.execute("CREATE TABLE msmarco AS SELECT * FROM df") + + # Cast the vector column to the required FLOAT[N] type if necessary + self.con.execute("ALTER TABLE msmarco ALTER COLUMN vector SET DATA TYPE FLOAT[]") + + # Now you can create the HNSW index on the msmarco table in the in-memory database + self.con.execute(f"CREATE INDEX hnsw_idx ON msmarco USING HNSW(vector) WITH (metric = 'ip')") diff --git a/vectordb_benchmark/encode_msmarco.sh b/vectordb_benchmark/encode_msmarco.sh new file mode 100644 index 000000000..dfc8ac59f --- /dev/null +++ b/vectordb_benchmark/encode_msmarco.sh @@ -0,0 +1,13 @@ +python3 tools/scripts/msmarco-passage/encode_queries.py \ + --encoder=bge-base-en-v1.5 \ + --input=collections/msmarco-passage/queries.dev.small.tsv \ + --output=collections/faiss-queries/msmarco-passage/queries.pkl + +python -m pyserini.encode \ + input --corpus collections/faiss-queries/msmarco-passage/queries.jsonl \ + output --embeddings indexes/msmarco-dev.bge-base-en-v1.5 \ + --to-faiss \ + encoder --encoder BAAI/bge-base-en-v1.5 --l2-norm \ + --device cpu \ + --pooling mean \ + --batch 32 \ No newline at end of file diff --git a/vectordb_benchmark/evaluate_trec.py b/vectordb_benchmark/evaluate_trec.py new file mode 100644 index 000000000..6f3bcaddd --- 
/dev/null +++ b/vectordb_benchmark/evaluate_trec.py @@ -0,0 +1,9 @@ +import subprocess + +command = [ + "python", "-m", "pyserini.eval.trec_eval", + "-c", "-M", "10", "-m", "recip_rank", + "../collections/msmarco-passage/qrels.dev.small.trec", + '../trec_dot_product_output.txt' + ] +subprocess.run(command) \ No newline at end of file diff --git a/vectordb_benchmark/faiss_index_adaptor.py b/vectordb_benchmark/faiss_index_adaptor.py new file mode 100644 index 000000000..08eb06fcb --- /dev/null +++ b/vectordb_benchmark/faiss_index_adaptor.py @@ -0,0 +1,60 @@ +# A virtual class that takes in an index name, and creates a FaissVectorExtractor, +# Extract the vectors, and then construct a vector db index +import faiss_vector_extractor +import time +import subprocess + +class VectorDBFaissIndexAdaptor: + def __init__(self, index_name, DBConfig): + self.index_name = index_name + self.extractor = faiss_vector_extractor.FaissVectorExtractor(index_name) + self.vector_map = None + self.table_name = None + self.DBConfig = DBConfig + + def extract_vectors_and_construct_index(self, table_name, metric, extract_all_vectors=False, vector_size=768): + self.initialize_database_and_table(table_name, self.DBConfig, vector_size) + # if extract all vectors, extract all, otherwise extract by batch via a while loop + # if extract_all_vectors: + # self.vector_map = self.extractor.extract_all_vectors() + # self.insert_vector_map_into_table(table_name, metric) + # else: + # startid = 0 + # batch_size = 100000 + # self.extractor.load_index() + # while startid < self.extractor.index.ntotal: + # # time extraction + # start_time = time.time() + # self.vector_map = self.extractor.extract_one_batch_of_vectors(startid, batch_size) + # end_time = time.time() + # print(f"Extracted {batch_size} vectors in {end_time - start_time} seconds") + # self.insert_vector_map_into_table(table_name, metric) + # startid += batch_size + self.construct_index(table_name, metric) + + def 
insert_vector_map_into_table(self, table_name, metric): + pass + + def construct_index(self, table_name, metric): + pass + + def initialize_database_and_table(self, table_name, DBConfig, vector_size): + pass + + def get_connection(self): + pass + + def run_benchmark(self, table_name, metric, K, vector_size, trec_file_path, query_vector_map=None): + pass + + def run_trec_eval(self, trec_output_file_path): + """Runs TREC evaluation and prints ndcg@10.""" + command = [ + "python", "-m", "pyserini.eval.trec_eval", + "-c", "-M", "10", "-m", "recip_rank", + "collections/msmarco-passage/qrels.dev.small.trec", + trec_output_file_path + ] + return subprocess.run(command) + + diff --git a/vectordb_benchmark/faiss_vector_extractor.py b/vectordb_benchmark/faiss_vector_extractor.py new file mode 100644 index 000000000..73d9cbce1 --- /dev/null +++ b/vectordb_benchmark/faiss_vector_extractor.py @@ -0,0 +1,63 @@ +from pyserini.util import download_prebuilt_index +from pyserini.search import FaissSearcher +import argparse +import numpy as np +import duckdb +import faiss + +TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = 'runs/.run-faiss-msmarco-passage-result_dot_product.txt' + +class FaissVectorExtractor: + def __init__(self, index_name): + try: + index_dir = download_prebuilt_index(index_name, verbose=True) + except ValueError as e: + print(str(e)) + exit(1) + self.index_file_path = index_dir + '/index' + self.docid_file_path = index_dir + '/docid' + self.index = None + self.docids = None + + def load_index(self): + self.index = faiss.read_index(self.index_file_path) + self.docids = FaissSearcher.load_docids(self.docid_file_path) + if not self.index: + raise Exception(f"Failed to load index from {self.index_file_path}") + + def extract_all_vectors(self): + if self.index is None: + self.load_index() + vectors = self.index.reconstruct_n(0, self.index.ntotal) + vector_map = {self.docids[i]: vector for i, vector in enumerate(vectors)} + + print("Finished loading index and reconstructed all 
vectors") + return vector_map + + def extract_one_batch_of_vectors(self, start_id, batch_size): + if self.index is None: + print("Loading index") + self.load_index() + batch_end = min(start_id + batch_size, self.index.ntotal) + vectors = self.index.reconstruct_n(start_id, batch_end - start_id) + vector_map = {self.docids[i+start_id]: vector for i, vector in enumerate(vectors)} + return vector_map + +def load_index_and_docids(query_index_path): + docids = FaissSearcher.load_docids(query_index_path + '/docid') + index = faiss.read_index(query_index_path + '/index') + vector_map = {} + for i in range(index.ntotal): + docid = docids[i] + vector = index.reconstruct(i) + vector_map[docid] = vector + return vector_map + +def run_benchmark(trec_output_file_path, metric, query_index_path, adaptor, table_name): + query_vector_map = load_index_and_docids(query_index_path) + adaptor.run_benchmark(table_name, metric, 20, 768, trec_output_file_path, query_vector_map) + +def run_benchmark_on_file(trec_output_file_path, metric, file_path, table_name, adaptor): + adaptor.create_in_memory_hnsw_index_on_file(file_path, table_name) + adaptor.run_benchmark(table_name, metric, 20, 768, trec_output_file_path) + \ No newline at end of file diff --git a/vectordb_benchmark/pgvector_faiss_index_adaptor.py b/vectordb_benchmark/pgvector_faiss_index_adaptor.py new file mode 100644 index 000000000..c7c066eeb --- /dev/null +++ b/vectordb_benchmark/pgvector_faiss_index_adaptor.py @@ -0,0 +1,106 @@ +import faiss_index_adaptor +import psycopg2 +import pandas as pd +import numpy as np +import time + +class PGVectorFaissIndexAdaptor(faiss_index_adaptor.VectorDBFaissIndexAdaptor): + def __init__(self, index_name, DBConfig): + super().__init__(index_name, DBConfig) + self.con = None + + def initialize_database_and_table(self, table_name, DBConfig, vector_size): + # connect to the database + conn = psycopg2.connect( + dbname=DBConfig['dbname'], + user=DBConfig['user'], + password=DBConfig['password'], + 
host=DBConfig['host'], + port=DBConfig['port'] + ) + cur = conn.cursor() + + # Create documents table + cur.execute(f""" + CREATE TABLE {table_name} ( + id INT, + vector VECTOR({vector_size}) + ) + """) + conn.commit() + self.con = conn + self.cur = cur + print(f"created table {table_name}") + + def construct_index(self, table_name, metric): + start_time = time.time() + self.cur.execute(f"CREATE INDEX ON {table_name} USING HNSW (vector vector_{metric}_ops);") + end_time = time.time() + print(f"Index constructed for {table_name} using {metric} metric in {end_time - start_time} seconds") + + def insert_vector_map_into_table(self, table_name, metric): + # Convert the numpy array to a list before inserting + insert_data = [(key, vector.tolist()) for key, vector in self.vector_map.items()] + + # Execute the SQL command with the list data + self.cur.executemany( + f"INSERT INTO {table_name} (id, vector) VALUES (%s, %s::vector)", + insert_data + ) + self.con.commit() # Use `conn.commit()` to commit the transaction + + def get_connection(self): + return self.con + + # close the connection + def close(self): + self.con.close() + + def run_benchmark(self, table_name, metric, K, vector_size, trec_file_path, query_vector_map=None): + print(f"running benchmark for {table_name} with metric {metric}") + with open(trec_file_path, 'w') as trec_file: + count = 0 + query_times = [] + sql_query = "" + for query_id, query_vector in query_vector_map.items(): + # Select appropriate SQL query based on the metric + if metric == 'l2sq': + sql_query = f"SELECT id, vector <-> %s::vector AS score FROM {table_name} ORDER BY score LIMIT %s" + elif metric == 'ip': + sql_query = f"SELECT id, (vector <#> %s::vector) * -1 AS score FROM {table_name} ORDER BY score DESC LIMIT %s" + elif metric == 'cosine': + sql_query = f"SELECT id, 1 - (vector <=> %s::vector) AS score FROM {table_name} ORDER BY score DESC LIMIT %s" + + # time the execution + start_time = time.time() + self.cur.execute(sql_query, 
(query_vector.tolist(), K)) # Execute the query + results = self.cur.fetchall() # Fetch all results from the executed query + end_time = time.time() + query_time = end_time - start_time + query_times.append(query_time) + + # Write results in TREC format + for rank, (doc_id, score) in enumerate(results, start=1): + trec_file.write(f"{query_id} Q0 {doc_id} {rank} {score} PGVector\n") + count += 1 + if count % 100 == 0: + print(f"processed {count} queries") + + print(f"TREC results written to {trec_file_path}") + ans = self.run_trec_eval(trec_file_path) + # Aggregate statistics + total_time = sum(query_times) + mean_time = np.mean(query_times) + variance_time = np.var(query_times) + min_time = min(query_times) + max_time = max(query_times) + # create a file to store results + with open(f"{table_name}_benchmark_results.txt", "w") as f: + f.write(f"Total time: {total_time}\n") + f.write(f"Mean time: {mean_time}\n") + f.write(f"Variance time: {variance_time}\n") + f.write(f"Min time: {min_time}\n") + f.write(f"Max time: {max_time}\n") + f.write(f"TREC eval output: {ans}\n") + return total_time, mean_time, variance_time, min_time, max_time + diff --git a/vectordb_benchmark/run_benchmark.py b/vectordb_benchmark/run_benchmark.py new file mode 100644 index 000000000..3745ba603 --- /dev/null +++ b/vectordb_benchmark/run_benchmark.py @@ -0,0 +1,44 @@ +import faiss_vector_extractor +import duckdb_faiss_index_adaptor +import pgvector_faiss_index_adaptor +import faiss_index_adaptor +import argparse +from faiss_vector_extractor import run_benchmark, run_benchmark_on_file + +TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = "trec_dot_product_output.txt" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='FAISS Vector DB Index Constructor') + parser.add_argument('--index_name', type=str, required=True, help='name of the FAISS index file') + parser.add_argument('--metric', type=str, required=True, help='metric of the FAISS index') + parser.add_argument('--table_name', 
type=str, required=True, help='name of the table to store the vectors') + parser.add_argument('--query_index_path', type=str, required=True, help='optional, if given, run benchmark on the query index') + parser.add_argument('--db_type', type=str, required=True, help='type of the database') + parser.add_argument('--file_path', type=str, required=False, help='optional, if given, create hnsw index on the file') + + args = parser.parse_args() + + if args.db_type == 'duckdb': + DBConfig = { + 'memory_limit': '100GB' + } + adaptor = duckdb_faiss_index_adaptor.DuckDBVectorDBFaissIndexAdaptor(args.index_name, DBConfig) + elif args.db_type == 'pgvector': + DBConfig = { + 'dbname': 'x59song', + 'user': 'x59song', + 'password': '123456', + 'host': 'localhost', + 'port': '5432' + } + adaptor = pgvector_faiss_index_adaptor.PGVectorFaissIndexAdaptor(args.index_name, DBConfig) + + if args.file_path: + run_benchmark_on_file(TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, args.metric, args.query_index_path, adaptor, args.table_name) + else: + adaptor.extract_vectors_and_construct_index(args.table_name, args.metric) + run_benchmark(TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, args.metric, args.query_index_path, adaptor, args.table_name) + + + From 4af3d0316c9e3f471dce745355c416cedd28af2a Mon Sep 17 00:00:00 2001 From: song Date: Wed, 11 Sep 2024 22:34:19 -0400 Subject: [PATCH 10/21] updated doc, included instruction on running full msmarco dataset --- vectordb_benchmark/benchmark_msmarco.sh | 5 +- vectordb_benchmark/duckdb_db_config.txt | 1 + vectordb_benchmark/experiment_vectordbs.md | 93 ++++++++++++++++++++++ vectordb_benchmark/faiss_index_adaptor.py | 30 +++---- vectordb_benchmark/pgvector_db_config.txt | 5 ++ vectordb_benchmark/run_benchmark.py | 16 ++-- 6 files changed, 123 insertions(+), 27 deletions(-) create mode 100644 vectordb_benchmark/duckdb_db_config.txt create mode 100644 vectordb_benchmark/experiment_vectordbs.md create mode 100644 vectordb_benchmark/pgvector_db_config.txt diff --git 
a/vectordb_benchmark/benchmark_msmarco.sh b/vectordb_benchmark/benchmark_msmarco.sh index 0abb31495..3fdb54957 100755 --- a/vectordb_benchmark/benchmark_msmarco.sh +++ b/vectordb_benchmark/benchmark_msmarco.sh @@ -1,6 +1,7 @@ -python3 vectordb_benchmark/run_benchmark.py \ +python3 ./run_benchmark.py \ --index_name='msmarco-v1-passage.bge-base-en-v1.5' \ --table_name='msmarco' \ --metric='ip' \ --query_index_path='/store/scratch/x59song/Research/pyserini/indexes/msmarco-dev.bge-base-en-v1.5' \ ---db_type='pgvector' \ +--db_config_file='duckdb_db_config.txt' \ +--db_type='duckdb' \ diff --git a/vectordb_benchmark/duckdb_db_config.txt b/vectordb_benchmark/duckdb_db_config.txt new file mode 100644 index 000000000..598a10fb2 --- /dev/null +++ b/vectordb_benchmark/duckdb_db_config.txt @@ -0,0 +1 @@ +memory_limit:100GB \ No newline at end of file diff --git a/vectordb_benchmark/experiment_vectordbs.md b/vectordb_benchmark/experiment_vectordbs.md new file mode 100644 index 000000000..298b334c8 --- /dev/null +++ b/vectordb_benchmark/experiment_vectordbs.md @@ -0,0 +1,93 @@ +# Overview +This document contains instructions for setting up and running benchmarks for querying MSMarco and NFCorpus using DuckDB and PGVector. + +# Prerequisites +- Pyserini Setup +- DuckDB 0.10.0+ installed +- PostgreSQL 14.0+ preferred +- PGVector 0.6.0+ installed + +# Database setup +## DuckDB +Duckdb is relatively easy to set up, as it is an in-memory database that can be embedded into a process. Therefore, a simple +`pip install duckdb` will suffice. Then, you should supply a config file, `duckdb_db_config.txt`, to specify the database configuration. The only parameter you need to tune is how much memory you want to allocate to the database. +``` +memory_limit:100GB +``` +Then, you can simply run the following, to run the benchmark. 
+ +``` +$ python3 vectordb_benchmark/run_benchmark.py \ + --index_name='msmarco-v1-passage.bge-base-en-v1.5' \ + --table_name='msmarco' \ + --metric='ip' \ + --query_index_path='pyserini/indexes/msmarco-dev.bge-base-en-v1.5' \ + --db_type='duckdb' \ + --db_config_file='duckdb_db_config.txt' +``` +The entire process may take over a day to complete, depending on your hardware setup. This code will download the index, extract the embedded vectors of the index, build the table in duckdb and run the benchmark. + +## PGVector +PGVector is an extension of PostgreSQL, so you will need to install PostgreSQL and PGVector. Here, it is assumed that you have a PostgreSQL server running on your local machine, and you have the PGVector extension installed and enabled in PostgreSQL. Make sure you supply the correct database configuration in the `pgvector_db_config.txt` file, one `key:value` pair per line with no spaces. For example: + +``` +dbname:main_db +user:main_user +password:123456 +host:localhost +port:5432 +``` + +Then, you can run the benchmark by running the following command. + +``` +$ python3 vectordb_benchmark/run_benchmark.py \ + --index_name='msmarco-v1-passage.bge-base-en-v1.5' \ + --table_name='msmarco' \ + --metric='ip' \ + --query_index_path='pyserini/indexes/msmarco-dev.bge-base-en-v1.5' \ + --db_type='pgvector' \ + --db_config_file='pgvector_db_config.txt' +``` + +# Encoding and Benchmarking NFCorpus using DuckDB and PGVector + +This document contains instructions for encoding and benchmarking NFCorpus using DuckDB and PGVector. + +## 1. Encode the Corpus +Create a directory for document embeddings and encode the corpus using the specified encoder. + +```bash +mkdir indexes/non-faiss-nfcorpus/documents +python -m pyserini.encode \ + input --corpus collections/nfcorpus/corpus.jsonl \ + --fields title text \ + output --embeddings indexes/non-faiss-nfcorpus/documents \ + encoder --encoder BAAI/bge-base-en-v1.5 --l2-norm \ + --device cpu \ + --pooling mean \ + --fields title text \ + --batch 32 +``` + +## 2. 
Encode the Queries +Create a directory for query embeddings and encode the queries using the specified encoder. + +```bash +mkdir indexes/non-faiss-nfcorpus/queries +python -m pyserini.encode \ + input --corpus collections/nfcorpus/queries.jsonl \ + --fields title text \ + output --embeddings indexes/non-faiss-nfcorpus/queries \ + encoder --encoder BAAI/bge-base-en-v1.5 --l2-norm \ + --device cpu \ + --pooling mean \ + --fields title text \ + --batch 32 +``` + +## 3. Run Benchmarks + +```bash +python3 benchmark_duckdb.py +python3 benchmark_pgvector.py diff --git a/vectordb_benchmark/faiss_index_adaptor.py b/vectordb_benchmark/faiss_index_adaptor.py index 08eb06fcb..367c18b39 100644 --- a/vectordb_benchmark/faiss_index_adaptor.py +++ b/vectordb_benchmark/faiss_index_adaptor.py @@ -15,21 +15,21 @@ def __init__(self, index_name, DBConfig): def extract_vectors_and_construct_index(self, table_name, metric, extract_all_vectors=False, vector_size=768): self.initialize_database_and_table(table_name, self.DBConfig, vector_size) # if extract all vectors, extract all, otherwise extract by batch via a while loop - # if extract_all_vectors: - # self.vector_map = self.extractor.extract_all_vectors() - # self.insert_vector_map_into_table(table_name, metric) - # else: - # startid = 0 - # batch_size = 100000 - # self.extractor.load_index() - # while startid < self.extractor.index.ntotal: - # # time extraction - # start_time = time.time() - # self.vector_map = self.extractor.extract_one_batch_of_vectors(startid, batch_size) - # end_time = time.time() - # print(f"Extracted {batch_size} vectors in {end_time - start_time} seconds") - # self.insert_vector_map_into_table(table_name, metric) - # startid += batch_size + if extract_all_vectors: + self.vector_map = self.extractor.extract_all_vectors() + self.insert_vector_map_into_table(table_name, metric) + else: + startid = 0 + batch_size = 100000 + self.extractor.load_index() + while startid < self.extractor.index.ntotal: + # time 
extraction + start_time = time.time() + self.vector_map = self.extractor.extract_one_batch_of_vectors(startid, batch_size) + end_time = time.time() + print(f"Extracted {batch_size} vectors in {end_time - start_time} seconds") + self.insert_vector_map_into_table(table_name, metric) + startid += batch_size self.construct_index(table_name, metric) def insert_vector_map_into_table(self, table_name, metric): diff --git a/vectordb_benchmark/pgvector_db_config.txt b/vectordb_benchmark/pgvector_db_config.txt new file mode 100644 index 000000000..2f28f9ffb --- /dev/null +++ b/vectordb_benchmark/pgvector_db_config.txt @@ -0,0 +1,5 @@ +username:main_user +password:123456 +host:localhost +port:5432 +database:main_database \ No newline at end of file diff --git a/vectordb_benchmark/run_benchmark.py b/vectordb_benchmark/run_benchmark.py index 3745ba603..c8b58e7b0 100644 --- a/vectordb_benchmark/run_benchmark.py +++ b/vectordb_benchmark/run_benchmark.py @@ -15,23 +15,19 @@ parser.add_argument('--table_name', type=str, required=True, help='name of the table to store the vectors') parser.add_argument('--query_index_path', type=str, required=True, help='optional, if given, run benchmark on the query index') parser.add_argument('--db_type', type=str, required=True, help='type of the database') + parser.add_argument('--db_config_file', type=str, required=True, help='config of the database, separated by end of line, key:value') parser.add_argument('--file_path', type=str, required=False, help='optional, if given, create hnsw index on the file') args = parser.parse_args() + # parse the db_config_file + with open(args.db_config_file, 'r') as f: + db_config = f.readlines() + DBConfig = {line.strip().split(':')[0]: line.strip().split(':')[1] for line in db_config} + if args.db_type == 'duckdb': - DBConfig = { - 'memory_limit': '100GB' - } adaptor = duckdb_faiss_index_adaptor.DuckDBVectorDBFaissIndexAdaptor(args.index_name, DBConfig) elif args.db_type == 'pgvector': - DBConfig = { - 
'dbname': 'x59song', - 'user': 'x59song', - 'password': '123456', - 'host': 'localhost', - 'port': '5432' - } adaptor = pgvector_faiss_index_adaptor.PGVectorFaissIndexAdaptor(args.index_name, DBConfig) if args.file_path: From 82e352785421fd6b27e7a5e14c3aca53448a2abc Mon Sep 17 00:00:00 2001 From: song Date: Wed, 11 Sep 2024 22:36:45 -0400 Subject: [PATCH 11/21] updated instruction --- vectordb_benchmark/experiment_vectordbs.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vectordb_benchmark/experiment_vectordbs.md b/vectordb_benchmark/experiment_vectordbs.md index 298b334c8..c775a66af 100644 --- a/vectordb_benchmark/experiment_vectordbs.md +++ b/vectordb_benchmark/experiment_vectordbs.md @@ -50,6 +50,8 @@ $ python3 vectordb_benchmark/run_benchmark.py \ --db_config_file='pgvector_db_config.txt' \ ``` +Note that after one run, your postgresql will contain the table data, so you may want to drop the table after running the benchmark. Later, we will add an option to skip table creation and index building, so that you can run the benchmark multiple times without having to re-create the table and index every time. + # Encoding and Benchmarking NFCorpus using DuckDB and PGVector This document contains instructions for encoding and benchmarking NFCorpus using DuckDB and PGVector. 
From 744e4ec55df6c4790d67a32c06d4a048ae75a9ff Mon Sep 17 00:00:00 2001 From: song Date: Wed, 11 Sep 2024 22:57:01 -0400 Subject: [PATCH 12/21] discarded unneeded files --- .gitignore | 6 +++++- duckdb_in_memory.py | 19 ------------------- duckdb_server.py | 17 ----------------- instructions.md | 40 ---------------------------------------- 4 files changed, 5 insertions(+), 77 deletions(-) delete mode 100644 duckdb_in_memory.py delete mode 100644 duckdb_server.py delete mode 100644 instructions.md diff --git a/.gitignore b/.gitignore index b4dd91734..5aa4c1db4 100644 --- a/.gitignore +++ b/.gitignore @@ -8,8 +8,9 @@ collections/* indexes/* .vscode/ venv/ -*.txt *.duckdb +*trec_dot_product* +*msmarco_benchmark_results* # build directories from `python3 setup.py sdist bdist_wheel` build/ @@ -22,6 +23,9 @@ runs/ # logs should also be ignored logs/ +# tools +tools/ + # binaries should also be ignored bin/* lib* diff --git a/duckdb_in_memory.py b/duckdb_in_memory.py deleted file mode 100644 index dd88cc915..000000000 --- a/duckdb_in_memory.py +++ /dev/null @@ -1,19 +0,0 @@ -import duckdb -import faiss_vector_extractor - -# Open the file-based DuckDB database -file_con = duckdb.connect('my_database.duckdb') - -# Create an in-memory DuckDB database -mem_con = duckdb.connect(database=':memory:') - -# Extract data from the file-based msmarco table into a Pandas DataFrame -df = file_con.execute("SELECT * FROM msmarco").fetchdf() - -# Register the DataFrame in the in-memory DuckDB database -mem_con.register('msmarco', df) - -# Now you can create the HNSW index on the msmarco table in the in-memory database -mem_con.execute(f"CREATE INDEX hnsw_idx ON msmarco USING HNSW(vector) WITH (metric = 'ip')") - -# Continue with your operations... 
diff --git a/duckdb_server.py b/duckdb_server.py deleted file mode 100644 index 49976f19e..000000000 --- a/duckdb_server.py +++ /dev/null @@ -1,17 +0,0 @@ -import duckdb -from flask import Flask, request, jsonify - -app = Flask(__name__) -con = duckdb.connect('my_database.duckdb') - -@app.route('/query', methods=['POST']) -def query_duckdb(): - query = request.json.get('query') - try: - result = con.execute(query).fetchdf() - return result.to_json(orient='split') - except Exception as e: - return jsonify({'error': str(e)}), 400 - -if __name__ == '__main__': - app.run(port=5000) diff --git a/instructions.md b/instructions.md deleted file mode 100644 index 989e6390d..000000000 --- a/instructions.md +++ /dev/null @@ -1,40 +0,0 @@ -# Encoding and Benchmarking Process - -## 1. Encode the Corpus -Create a directory for document embeddings and encode the corpus using the specified encoder. - -```bash -mkdir indexes/non-faiss-nfcorpus/documents -python -m pyserini.encode \ - input --corpus collections/nfcorpus/corpus.jsonl \ - --fields title text \ - output --embeddings indexes/non-faiss-nfcorpus/documents \ - encoder --encoder BAAI/bge-base-en-v1.5 --l2-norm \ - --device cpu \ - --pooling mean \ - --fields title text \ - --batch 32 -``` - -## 2. Encode the Queries -Create a directory for query embeddings and encode the queries using the specified encoder. - -```bash -mkdir indexes/non-faiss-nfcorpus/queries -python -m pyserini.encode \ - input --corpus collections/nfcorpus/queries.jsonl \ - --fields title text \ - output --embeddings indexes/non-faiss-nfcorpus/queries \ - encoder --encoder BAAI/bge-base-en-v1.5 --l2-norm \ - --device cpu \ - --pooling mean \ - --fields title text \ - --batch 32 -``` - -## 3. 
Run Benchmarks - -```bash -python3 benchmark_duckdb.py -python3 benchmark_pgvector.py - From 894f67bd33fcee45d338e3cf686525ece268a997 Mon Sep 17 00:00:00 2001 From: song Date: Sat, 14 Sep 2024 11:20:01 -0400 Subject: [PATCH 13/21] addressed comments by reorganizing files and adding .gitkeep --- collections/.gitkeep | 0 .../experiment_vectordbs.md | 64 +++++++++++++++++-- .../benchmark_duckdb_nfcorpus.py | 0 .../benchmark_msmarco_duckdb.sh | 2 +- .../benchmark_msmarco_pgvector.sh | 9 +++ .../benchmark_pg_vector_nfcorpus.py | 0 .../vectordb_benchmark}/duckdb_db_config.txt | 0 .../duckdb_faiss_index_adaptor.py | 0 .../vectordb_benchmark}/evaluate_trec.py | 4 +- .../faiss_index_adaptor.py | 0 .../faiss_vector_extractor.py | 0 .../pgvector_db_config.txt | 0 .../pgvector_faiss_index_adaptor.py | 0 .../vectordb_benchmark}/run_benchmark.py | 0 vectordb_benchmark/encode_msmarco.sh | 13 ---- 15 files changed, 69 insertions(+), 23 deletions(-) create mode 100644 collections/.gitkeep rename {vectordb_benchmark => docs}/experiment_vectordbs.md (64%) rename vectordb_benchmark/benchmark_duckdb.py => scripts/vectordb_benchmark/benchmark_duckdb_nfcorpus.py (100%) rename vectordb_benchmark/benchmark_msmarco.sh => scripts/vectordb_benchmark/benchmark_msmarco_duckdb.sh (64%) create mode 100644 scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh rename vectordb_benchmark/benchmark_pg_vector.py => scripts/vectordb_benchmark/benchmark_pg_vector_nfcorpus.py (100%) rename {vectordb_benchmark => scripts/vectordb_benchmark}/duckdb_db_config.txt (100%) rename {vectordb_benchmark => scripts/vectordb_benchmark}/duckdb_faiss_index_adaptor.py (100%) rename {vectordb_benchmark => scripts/vectordb_benchmark}/evaluate_trec.py (58%) rename {vectordb_benchmark => scripts/vectordb_benchmark}/faiss_index_adaptor.py (100%) rename {vectordb_benchmark => scripts/vectordb_benchmark}/faiss_vector_extractor.py (100%) rename {vectordb_benchmark => scripts/vectordb_benchmark}/pgvector_db_config.txt (100%) 
rename {vectordb_benchmark => scripts/vectordb_benchmark}/pgvector_faiss_index_adaptor.py (100%) rename {vectordb_benchmark => scripts/vectordb_benchmark}/run_benchmark.py (100%) delete mode 100644 vectordb_benchmark/encode_msmarco.sh diff --git a/collections/.gitkeep b/collections/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/vectordb_benchmark/experiment_vectordbs.md b/docs/experiment_vectordbs.md similarity index 64% rename from vectordb_benchmark/experiment_vectordbs.md rename to docs/experiment_vectordbs.md index c775a66af..37dfb20a3 100644 --- a/vectordb_benchmark/experiment_vectordbs.md +++ b/docs/experiment_vectordbs.md @@ -3,11 +3,13 @@ This document contains instructions for setting up and running benchmarks for qu # Prerequisites - Pyserini Setup -- DuckDB 0.10.0+ installed -- PostgreSQL 14.0+ preferred -- PGVector 0.6.0+ installed # Database setup +First, activate a Conda environment, for example your pyserini environment. + +```bash +conda activate pyserini +``` ## DuckDB Duckdb is relatively easy to set up, as it is an in-memory database that can be embedded into a process. Therefore, a simple `pip install duckdb` will suffice. Then, you should supply a config file, `duckdb_db_config.txt`, to specify the database configuration. The only parameter you need to tune is how much memory you want to allocate to the database. @@ -27,8 +29,56 @@ $ python3 vectordb_benchmark/run_benchmark.py \ ``` The entire process may take over a day to complete, depending on your hardware set up. This code will download the index, extract the embedded vectors of the index, build the table in duckdb and run the benchmark. -## PGVector -PGVector is an extension of PostgreSQL, so you will need to install PostgreSQL and PGVector. Here, it is assumed that you have a PostgreSQL server running on your local machine, and you have the PGVector extension installed and enabled in PostgreSQL. 
Make sure you supply the correct database configuration in the `db_config.txt` file. For example: +# PGVector +PGVector is an extension of PostgreSQL, so you will need to install both PostgreSQL and PGVector. + +## PostgreSQL and pgvector Installation Guide (Conda) + +This guide provides step-by-step instructions on how to install PostgreSQL using Conda and manually install the `pgvector` extension. By following these instructions, you'll be able to set up PostgreSQL and create the `pgvector` extension successfully. + +### Install PostgreSQL +```bash +conda install -c conda-forge postgresql +``` + +### Initialize and start the database +```bash +initdb -D /path/to/your/database_directory +pg_ctl -D /path/to/your/database_directory start +``` + +### Install and activate PGVector +To manually install pgvector, first install the necessary build tools (gcc and make) using Conda: +```bash +conda install -c conda-forge gcc_linux-64 make +``` + +```bash +git clone https://github.com/pgvector/pgvector.git +cd pgvector +make PG_CONFIG=$(which pg_config) +make install +``` + +After the installation, verify that the pgvector.control file and library were installed correctly: +```bash +ls $(pg_config --sharedir)/extension/pgvector.control +ls $(pg_config --pkglibdir)/vector.so +``` +If both files are present, the installation was successful. + +Restart PostgreSQL to enable the pgvector extension: +```bash +pg_ctl -D /path/to/your/database_directory stop +pg_ctl -D /path/to/your/database_directory start +``` + +```bash +psql postgres +CREATE EXTENSION pgvector; +``` + +Now that you have the PGVector extension installed and enabled in PostgreSQL. You can start running the benchmark, but first, make sure you supply the correct database configuration in the `pgvector_db_config.txt` file. For example: ``` dbname: main_db @@ -91,5 +141,5 @@ python -m pyserini.encode \ ## 3. 
Run Benchmarks ```bash -python3 benchmark_duckdb.py -python3 benchmark_pgvector.py +python3 benchmark_duckdb_nfcorpus.py +python3 benchmark_pgvector_nfcorpus.py diff --git a/vectordb_benchmark/benchmark_duckdb.py b/scripts/vectordb_benchmark/benchmark_duckdb_nfcorpus.py similarity index 100% rename from vectordb_benchmark/benchmark_duckdb.py rename to scripts/vectordb_benchmark/benchmark_duckdb_nfcorpus.py diff --git a/vectordb_benchmark/benchmark_msmarco.sh b/scripts/vectordb_benchmark/benchmark_msmarco_duckdb.sh similarity index 64% rename from vectordb_benchmark/benchmark_msmarco.sh rename to scripts/vectordb_benchmark/benchmark_msmarco_duckdb.sh index 3fdb54957..bcb8a21c0 100755 --- a/vectordb_benchmark/benchmark_msmarco.sh +++ b/scripts/vectordb_benchmark/benchmark_msmarco_duckdb.sh @@ -2,6 +2,6 @@ python3 ./run_benchmark.py \ --index_name='msmarco-v1-passage.bge-base-en-v1.5' \ --table_name='msmarco' \ --metric='ip' \ ---query_index_path='/store/scratch/x59song/Research/pyserini/indexes/msmarco-dev.bge-base-en-v1.5' \ +--query_index_path='../../indexes/msmarco-dev.bge-base-en-v1.5' \ --db_config_file='duckdb_db_config.txt' \ --db_type='duckdb' \ diff --git a/scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh b/scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh new file mode 100644 index 000000000..f9610425b --- /dev/null +++ b/scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +python3 ./run_benchmark.py \ +--index_name='msmarco-v1-passage.bge-base-en-v1.5' \ +--table_name='msmarco' \ +--metric='ip' \ +--query_index_path='../../indexes/msmarco-dev.bge-base-en-v1.5' \ +--db_type='pgvector' \ +--db_config_file='pgvector_db_config.txt' \ No newline at end of file diff --git a/vectordb_benchmark/benchmark_pg_vector.py b/scripts/vectordb_benchmark/benchmark_pg_vector_nfcorpus.py similarity index 100% rename from vectordb_benchmark/benchmark_pg_vector.py rename to 
scripts/vectordb_benchmark/benchmark_pg_vector_nfcorpus.py diff --git a/vectordb_benchmark/duckdb_db_config.txt b/scripts/vectordb_benchmark/duckdb_db_config.txt similarity index 100% rename from vectordb_benchmark/duckdb_db_config.txt rename to scripts/vectordb_benchmark/duckdb_db_config.txt diff --git a/vectordb_benchmark/duckdb_faiss_index_adaptor.py b/scripts/vectordb_benchmark/duckdb_faiss_index_adaptor.py similarity index 100% rename from vectordb_benchmark/duckdb_faiss_index_adaptor.py rename to scripts/vectordb_benchmark/duckdb_faiss_index_adaptor.py diff --git a/vectordb_benchmark/evaluate_trec.py b/scripts/vectordb_benchmark/evaluate_trec.py similarity index 58% rename from vectordb_benchmark/evaluate_trec.py rename to scripts/vectordb_benchmark/evaluate_trec.py index 6f3bcaddd..714954d0d 100644 --- a/vectordb_benchmark/evaluate_trec.py +++ b/scripts/vectordb_benchmark/evaluate_trec.py @@ -3,7 +3,7 @@ command = [ "python", "-m", "pyserini.eval.trec_eval", "-c", "-M", "10", "-m", "recip_rank", - "../collections/msmarco-passage/qrels.dev.small.trec", - '../trec_dot_product_output.txt' + "../../collections/msmarco-passage/qrels.dev.small.trec", + '../../trec_dot_product_output.txt' ] subprocess.run(command) \ No newline at end of file diff --git a/vectordb_benchmark/faiss_index_adaptor.py b/scripts/vectordb_benchmark/faiss_index_adaptor.py similarity index 100% rename from vectordb_benchmark/faiss_index_adaptor.py rename to scripts/vectordb_benchmark/faiss_index_adaptor.py diff --git a/vectordb_benchmark/faiss_vector_extractor.py b/scripts/vectordb_benchmark/faiss_vector_extractor.py similarity index 100% rename from vectordb_benchmark/faiss_vector_extractor.py rename to scripts/vectordb_benchmark/faiss_vector_extractor.py diff --git a/vectordb_benchmark/pgvector_db_config.txt b/scripts/vectordb_benchmark/pgvector_db_config.txt similarity index 100% rename from vectordb_benchmark/pgvector_db_config.txt rename to 
scripts/vectordb_benchmark/pgvector_db_config.txt diff --git a/vectordb_benchmark/pgvector_faiss_index_adaptor.py b/scripts/vectordb_benchmark/pgvector_faiss_index_adaptor.py similarity index 100% rename from vectordb_benchmark/pgvector_faiss_index_adaptor.py rename to scripts/vectordb_benchmark/pgvector_faiss_index_adaptor.py diff --git a/vectordb_benchmark/run_benchmark.py b/scripts/vectordb_benchmark/run_benchmark.py similarity index 100% rename from vectordb_benchmark/run_benchmark.py rename to scripts/vectordb_benchmark/run_benchmark.py diff --git a/vectordb_benchmark/encode_msmarco.sh b/vectordb_benchmark/encode_msmarco.sh deleted file mode 100644 index dfc8ac59f..000000000 --- a/vectordb_benchmark/encode_msmarco.sh +++ /dev/null @@ -1,13 +0,0 @@ -python3 tools/scripts/msmarco-passage/encode_queries.py \ - --encoder=bge-base-en-v1.5 \ - --input=collections/msmarco-passage/queries.dev.small.tsv \ - --output=collections/faiss-queries/msmarco-passage/queries.pkl - -python -m pyserini.encode \ - input --corpus collections/faiss-queries/msmarco-passage/queries.jsonl \ - output --embeddings indexes/msmarco-dev.bge-base-en-v1.5 \ - --to-faiss \ - encoder --encoder BAAI/bge-base-en-v1.5 --l2-norm \ - --device cpu \ - --pooling mean \ - --batch 32 \ No newline at end of file From 410ad2e3805a0bae3385a40bc2fa3fe1bbac4811 Mon Sep 17 00:00:00 2001 From: song Date: Sat, 14 Sep 2024 11:26:20 -0400 Subject: [PATCH 14/21] added git keep content --- collections/.gitkeep | 1 + 1 file changed, 1 insertion(+) diff --git a/collections/.gitkeep b/collections/.gitkeep index e69de29bb..b1adcd339 100644 --- a/collections/.gitkeep +++ b/collections/.gitkeep @@ -0,0 +1 @@ +# This is the default directory for document collections. Placeholder so that directory is kept in git. 
\ No newline at end of file From 0970e93dd5bef3d35a7c987f5103467349426dd2 Mon Sep 17 00:00:00 2001 From: song Date: Sat, 14 Sep 2024 12:18:22 -0400 Subject: [PATCH 15/21] filename change --- ...{benchmark_duckdb_nfcorpus.py => benchmark_nfcorpus_duckdb.py} | 0 ...hmark_pg_vector_nfcorpus.py => benchmark_nfcorpus_pgvector.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename scripts/vectordb_benchmark/{benchmark_duckdb_nfcorpus.py => benchmark_nfcorpus_duckdb.py} (100%) rename scripts/vectordb_benchmark/{benchmark_pg_vector_nfcorpus.py => benchmark_nfcorpus_pgvector.py} (100%) diff --git a/scripts/vectordb_benchmark/benchmark_duckdb_nfcorpus.py b/scripts/vectordb_benchmark/benchmark_nfcorpus_duckdb.py similarity index 100% rename from scripts/vectordb_benchmark/benchmark_duckdb_nfcorpus.py rename to scripts/vectordb_benchmark/benchmark_nfcorpus_duckdb.py diff --git a/scripts/vectordb_benchmark/benchmark_pg_vector_nfcorpus.py b/scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py similarity index 100% rename from scripts/vectordb_benchmark/benchmark_pg_vector_nfcorpus.py rename to scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py From 168b084ad366c5d129c9fdb3ca3a6a8ba3748735 Mon Sep 17 00:00:00 2001 From: song Date: Sat, 14 Sep 2024 12:23:18 -0400 Subject: [PATCH 16/21] modified instruction file to reflect file name changes --- docs/experiment_vectordbs.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/experiment_vectordbs.md b/docs/experiment_vectordbs.md index 37dfb20a3..a0da66ef6 100644 --- a/docs/experiment_vectordbs.md +++ b/docs/experiment_vectordbs.md @@ -141,5 +141,5 @@ python -m pyserini.encode \ ## 3. 
Run Benchmarks ```bash -python3 benchmark_duckdb_nfcorpus.py -python3 benchmark_pgvector_nfcorpus.py +python3 benchmark_nfcorpus_duckdb.py +python3 benchmark_nfcorpus_pgvector.py From 36a3490252acf6cf1930488f3d4fcfe4b9dc4fa3 Mon Sep 17 00:00:00 2001 From: song Date: Sat, 14 Sep 2024 12:31:47 -0400 Subject: [PATCH 17/21] deleted unneeded files, and cleanup naming --- ...t_vectordbs.md => experiment-vectordbs.md} | 4 +-- msmarco_benchmark.py | 27 ------------------- nfcorpus_results.txt | 15 ----------- .../benchmark_msmarco_duckdb.sh | 2 +- .../benchmark_msmarco_pgvector.sh | 2 -- scripts/vectordb_benchmark/run_benchmark.py | 1 - 6 files changed, 3 insertions(+), 48 deletions(-) rename docs/{experiment_vectordbs.md => experiment-vectordbs.md} (98%) delete mode 100644 msmarco_benchmark.py delete mode 100644 nfcorpus_results.txt diff --git a/docs/experiment_vectordbs.md b/docs/experiment-vectordbs.md similarity index 98% rename from docs/experiment_vectordbs.md rename to docs/experiment-vectordbs.md index a0da66ef6..b0bd1caea 100644 --- a/docs/experiment_vectordbs.md +++ b/docs/experiment-vectordbs.md @@ -24,8 +24,8 @@ $ python3 vectordb_benchmark/run_benchmark.py \ --table_name='msmarco' \ --metric='ip' \ --query_index_path='pyserini/indexes/msmarco-dev.bge-base-en-v1.5' \ - --db_type='duckdb' - --db_config_file='duckdb_db_config.txt' \ + --db_type='duckdb' \ + --db_config_file='duckdb_db_config.txt' ``` The entire process may take over a day to complete, depending on your hardware set up. This code will download the index, extract the embedded vectors of the index, build the table in duckdb and run the benchmark. 
diff --git a/msmarco_benchmark.py b/msmarco_benchmark.py deleted file mode 100644 index 2720731ca..000000000 --- a/msmarco_benchmark.py +++ /dev/null @@ -1,27 +0,0 @@ -import argparse -import faiss -import faiss_vector_extractor - -TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = "/store/scratch/x59song/trec_dot_product_output.txt" - -def run_benchmark(trec_output_file_path, metric, query_index_path, adaptor): - query_vector_map = load_index_and_docids(query_index_path) - adaptor.run_benchmark(query_vector_map, table_name, metric, 20, 768, trec_output_file_path) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='FAISS Vector DB Index Constructor') - parser.add_argument('--index_name', type=str, required=True, help='name of the FAISS index file') - parser.add_argument('--metric', type=str, required=True, help='metric of the FAISS index') - parser.add_argument('--table_name', type=str, required=True, help='name of the table to store the vectors') - args = parser.parse_args() - - DBConfig = { - 'temp_directory': '/store/scratch/x59song/temp', - 'memory_limit': '50GB' - } - - adaptor = DuckDBVectorDBFaissIndexAdaptor(args.index_name, DBConfig) - adaptor.extract_vectors_and_construct_index(args.table_name, args.metric) - run_benchmark(TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, args.metric, args.index_name, adaptor) - - \ No newline at end of file diff --git a/nfcorpus_results.txt b/nfcorpus_results.txt deleted file mode 100644 index c9008f15f..000000000 --- a/nfcorpus_results.txt +++ /dev/null @@ -1,15 +0,0 @@ -duckdb: -building l2sq index: 0.3682847023010254 0.3575248718261719 0.35877418518066406 -building cosine index: 0.4233689308166504 0.4250659942626953 0.4125690460205078 -building ip index: 0.35698509216308594 0.326251745223999 0.33107995986938477 -l2sq: 19.746002674102783 21.720022916793823 20.766737937927246 19.952106475830078(second run) -cosine: 22.334033727645874 22.69918704032898 22.870506525039673 22.43225622177124(second run) -ip: 20.792579174041748 
19.3823139667511 20.307250261306763 20.414534091949463(second run) - -pg_vector: -building l2sq index: 2.4153892993927 2.3378589153289795 2.276991844177246 -building cosine index: 2.4951090812683105 2.369596004486084 2.459275960922241 -building ip index: 2.471719980239868 2.325632095336914 2.4149928092956543 -l2sq: 5.069890260696411 4.91141152381897 4.930738925933838 4.911103963851929(second run) -cosine: 31.49447011947632 31.42801332473755 33.082948207855225 31.616244316101074(second run) -ip: 28.120339155197144 27.629921197891235 30.123175144195557 29.147559881210327(second run) \ No newline at end of file diff --git a/scripts/vectordb_benchmark/benchmark_msmarco_duckdb.sh b/scripts/vectordb_benchmark/benchmark_msmarco_duckdb.sh index bcb8a21c0..af1c565e9 100755 --- a/scripts/vectordb_benchmark/benchmark_msmarco_duckdb.sh +++ b/scripts/vectordb_benchmark/benchmark_msmarco_duckdb.sh @@ -3,5 +3,5 @@ python3 ./run_benchmark.py \ --table_name='msmarco' \ --metric='ip' \ --query_index_path='../../indexes/msmarco-dev.bge-base-en-v1.5' \ ---db_config_file='duckdb_db_config.txt' \ --db_type='duckdb' \ +--db_config_file='duckdb_db_config.txt' diff --git a/scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh b/scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh index f9610425b..aae74197c 100644 --- a/scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh +++ b/scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh @@ -1,5 +1,3 @@ -#!/bin/bash - python3 ./run_benchmark.py \ --index_name='msmarco-v1-passage.bge-base-en-v1.5' \ --table_name='msmarco' \ diff --git a/scripts/vectordb_benchmark/run_benchmark.py b/scripts/vectordb_benchmark/run_benchmark.py index c8b58e7b0..5f53230d3 100644 --- a/scripts/vectordb_benchmark/run_benchmark.py +++ b/scripts/vectordb_benchmark/run_benchmark.py @@ -16,7 +16,6 @@ parser.add_argument('--query_index_path', type=str, required=True, help='optional, if given, run benchmark on the query index') parser.add_argument('--db_type', 
type=str, required=True, help='type of the database') parser.add_argument('--db_config_file', type=str, required=True, help='config of the database, separated by end of line, key:value') - parser.add_argument('--file_path', type=str, required=False, help='optional, if given, create hnsw index on the file') args = parser.parse_args() From 3ae4b182e1302cbd0b8e31e7a46829e591291087 Mon Sep 17 00:00:00 2001 From: song Date: Sat, 14 Sep 2024 12:36:17 -0400 Subject: [PATCH 18/21] changed relative file paths for benchmarks --- .../vectordb_benchmark/benchmark_nfcorpus_duckdb.py | 12 ++++++------ .../benchmark_nfcorpus_pgvector.py | 12 ++++++------ scripts/vectordb_benchmark/faiss_index_adaptor.py | 2 +- scripts/vectordb_benchmark/faiss_vector_extractor.py | 2 +- scripts/vectordb_benchmark/run_benchmark.py | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/scripts/vectordb_benchmark/benchmark_nfcorpus_duckdb.py b/scripts/vectordb_benchmark/benchmark_nfcorpus_duckdb.py index acbf3d1c6..5fd893835 100644 --- a/scripts/vectordb_benchmark/benchmark_nfcorpus_duckdb.py +++ b/scripts/vectordb_benchmark/benchmark_nfcorpus_duckdb.py @@ -5,11 +5,11 @@ import time # Paths to embedding, query, and output files -DOCUMENT_JSONL_FILE_PATH = 'indexes/non-faiss-nfcorpus/documents/embeddings.jsonl' -QUERY_JSONL_FILE_PATH = 'indexes/non-faiss-nfcorpus/queries/embeddings.jsonl' -TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_dot_product.txt' -TREC_COSINE_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_cosine.txt' -TREC_L2SQ_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_l2sq.txt' +DOCUMENT_JSONL_FILE_PATH = '../../indexes/non-faiss-nfcorpus/documents/embeddings.jsonl' +QUERY_JSONL_FILE_PATH = '../../indexes/non-faiss-nfcorpus/queries/embeddings.jsonl' +TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = '../../runs/.run-non-faiss-nfcorpus-result_dot_product.txt' +TREC_COSINE_OUTPUT_FILE_PATH = '../../runs/.run-non-faiss-nfcorpus-result_cosine.txt' 
+TREC_L2SQ_OUTPUT_FILE_PATH = '../../runs/.run-non-faiss-nfcorpus-result_l2sq.txt' K = 10 # Number of nearest neighbors to retrieve RUN_ID = "DuckDBHNSW" # Identifier for the run @@ -71,7 +71,7 @@ def run_trec_eval(trec_output_file_path): command = [ "python", "-m", "pyserini.eval.trec_eval", "-c", "-m", "ndcg_cut.10", - "collections/nfcorpus/qrels/test.qrels", + "../../collections/nfcorpus/qrels/test.qrels", trec_output_file_path ] print("ndcg@10 for ", trec_output_file_path) diff --git a/scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py b/scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py index 2cff06e69..092082529 100644 --- a/scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py +++ b/scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py @@ -5,11 +5,11 @@ import time # Paths to embedding, query, and output files -DOCUMENT_JSONL_FILE_PATH = 'indexes/non-faiss-nfcorpus/documents/embeddings.jsonl' -QUERY_JSONL_FILE_PATH = 'indexes/non-faiss-nfcorpus/queries/embeddings.jsonl' -TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_dot_product.txt' -TREC_COSINE_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_cosine.txt' -TREC_L2SQ_OUTPUT_FILE_PATH = 'runs/.run-non-faiss-nfcorpus-result_l2sq.txt' +DOCUMENT_JSONL_FILE_PATH = '../../indexes/non-faiss-nfcorpus/documents/embeddings.jsonl' +QUERY_JSONL_FILE_PATH = '../../indexes/non-faiss-nfcorpus/queries/embeddings.jsonl' +TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = '../../runs/.run-non-faiss-nfcorpus-result_dot_product.txt' +TREC_COSINE_OUTPUT_FILE_PATH = '../../runs/.run-non-faiss-nfcorpus-result_cosine.txt' +TREC_L2SQ_OUTPUT_FILE_PATH = '../../runs/.run-non-faiss-nfcorpus-result_l2sq.txt' VECTOR_SIZE = 768 K = 10 # Number of nearest neighbors to retrieve RUN_ID = "PostgresHNSW" @@ -67,7 +67,7 @@ def run_trec_eval(trec_output_file_path): command = [ "python", "-m", "pyserini.eval.trec_eval", "-c", "-m", "ndcg_cut.10", - "collections/nfcorpus/qrels/test.qrels", + 
"../../collections/nfcorpus/qrels/test.qrels", trec_output_file_path ] print("ndcg@10 for ", trec_output_file_path) diff --git a/scripts/vectordb_benchmark/faiss_index_adaptor.py b/scripts/vectordb_benchmark/faiss_index_adaptor.py index 367c18b39..460312e61 100644 --- a/scripts/vectordb_benchmark/faiss_index_adaptor.py +++ b/scripts/vectordb_benchmark/faiss_index_adaptor.py @@ -52,7 +52,7 @@ def run_trec_eval(self, trec_output_file_path): command = [ "python", "-m", "pyserini.eval.trec_eval", "-c", "-M", "10", "-m", "recip_rank", - "collections/msmarco-passage/qrels.dev.small.trec", + "../../collections/msmarco-passage/qrels.dev.small.trec", trec_output_file_path ] return subprocess.run(command) diff --git a/scripts/vectordb_benchmark/faiss_vector_extractor.py b/scripts/vectordb_benchmark/faiss_vector_extractor.py index 73d9cbce1..394280b2b 100644 --- a/scripts/vectordb_benchmark/faiss_vector_extractor.py +++ b/scripts/vectordb_benchmark/faiss_vector_extractor.py @@ -5,7 +5,7 @@ import duckdb import faiss -TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = 'runs/.run-faiss-msmarco-passage-result_dot_product.txt' +TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = '../../runs/.run-faiss-msmarco-passage-result_dot_product.txt' class FaissVectorExtractor: def __init__(self, index_name): diff --git a/scripts/vectordb_benchmark/run_benchmark.py b/scripts/vectordb_benchmark/run_benchmark.py index 5f53230d3..8c1d4c49c 100644 --- a/scripts/vectordb_benchmark/run_benchmark.py +++ b/scripts/vectordb_benchmark/run_benchmark.py @@ -5,7 +5,7 @@ import argparse from faiss_vector_extractor import run_benchmark, run_benchmark_on_file -TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = "trec_dot_product_output.txt" +TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = "../../trec_dot_product_output.txt" if __name__ == "__main__": From f51ee6a65f578dba7315d85b28f39989595bb7a4 Mon Sep 17 00:00:00 2001 From: song Date: Sun, 22 Sep 2024 19:41:52 -0400 Subject: [PATCH 19/21] cleaned up code --- docs/experiment-vectordbs.md | 34 ++++++++----------- 
.../benchmark_msmarco_pgvector.sh | 0 .../benchmark_nfcorpus_pgvector.py | 25 ++++++++++---- scripts/vectordb_benchmark/benchmarking.sh | 14 ++++++++ .../init_and_start_postgres.sh | 32 +++++++++++++++++ .../vectordb_benchmark/pgvector_db_config.txt | 4 +-- .../pgvector_faiss_index_adaptor.py | 5 ++- scripts/vectordb_benchmark/run_benchmark.py | 7 ++-- scripts/vectordb_benchmark/run_sql.sh | 29 ++++++++++++++++ scripts/vectordb_benchmark/setup_db.sql | 9 +++++ 10 files changed, 124 insertions(+), 35 deletions(-) mode change 100644 => 100755 scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh create mode 100755 scripts/vectordb_benchmark/benchmarking.sh create mode 100755 scripts/vectordb_benchmark/init_and_start_postgres.sh create mode 100755 scripts/vectordb_benchmark/run_sql.sh create mode 100644 scripts/vectordb_benchmark/setup_db.sql diff --git a/docs/experiment-vectordbs.md b/docs/experiment-vectordbs.md index b0bd1caea..fbf4d8670 100644 --- a/docs/experiment-vectordbs.md +++ b/docs/experiment-vectordbs.md @@ -67,18 +67,14 @@ ls $(pg_config --pkglibdir)/vector.so ``` If both files are present, the installation was successful. -Restart PostgreSQL to enable the pgvector extension: -```bash -pg_ctl -D /path/to/your/database_directory stop -pg_ctl -D /path/to/your/database_directory start -``` +Now, you have to initialize the database, create the vector extension and create a user and database for your experiment. The script +`vectordb_benchmark/init_and_start_postgres.sh` will do this for you. It will ask you for a directory for the database data, and then it will create a database +called `main_database` and a user called `main_user`, and enable the vector extension, so you can simply run: ```bash -psql postgres -CREATE EXTENSION pgvector; +./init_and_start_postgres.sh ``` - -Now that you have the PGVector extension installed and enabled in PostgreSQL. 
You can start running the benchmark, but first, make sure you supply the correct database configuration in the `pgvector_db_config.txt` file. For example: +Now that you have the PGVector extension installed and enabled in PostgreSQL. You can start running the benchmark, but first, make sure you supply the correct database configuration in the `pgvector_db_config.txt` file. For example, by default: ``` dbname: main_db @@ -99,8 +95,9 @@ $ python3 vectordb_benchmark/run_benchmark.py \ --db_type='pgvector' \ --db_config_file='pgvector_db_config.txt' \ ``` +or simply run the script `benchmark_msmarco_pgvector.sh` -Note that after one run, your postgresql will contain the table data, so you may want to drop the table after running the benchmark. Later, we will add an option to skip table creation and index building, so that you can run the benchmark multiple times without having to re-create the table and index every time. +Note that after one run, your postgresql will contain the table data, the current behaviour is to drop the table and index if they exist when the benchmark started. Later, we will add an option to skip table creation and index building, so that you can run the benchmark multiple times without having to re-create the table and index every time. # Encoding and Benchmarking NFCorpus using DuckDB and PGVector @@ -110,15 +107,14 @@ This document contains instructions for encoding and benchmarking NFCorpus using Create a directory for document embeddings and encode the corpus using the specified encoder. 
```bash -mkdir indexes/non-faiss-nfcorpus/documents +mkdir ../../indexes/non-faiss-nfcorpus +mkdir ../../indexes/non-faiss-nfcorpus/documents python -m pyserini.encode \ - input --corpus collections/nfcorpus/corpus.jsonl \ - --fields title text \ - output --embeddings indexes/non-faiss-nfcorpus/documents \ + input --corpus ../../collections/nfcorpus/corpus.jsonl \ + output --embeddings ../../indexes/non-faiss-nfcorpus/documents \ encoder --encoder BAAI/bge-base-en-v1.5 --l2-norm \ --device cpu \ --pooling mean \ - --fields title text \ --batch 32 ``` @@ -126,15 +122,13 @@ python -m pyserini.encode \ Create a directory for query embeddings and encode the queries using the specified encoder. ```bash -mkdir indexes/non-faiss-nfcorpus/queries +mkdir ../../indexes/non-faiss-nfcorpus/queries python -m pyserini.encode \ - input --corpus collections/nfcorpus/queries.jsonl \ - --fields title text \ - output --embeddings indexes/non-faiss-nfcorpus/queries \ + input --corpus ../../collections/nfcorpus/queries.jsonl \ + output --embeddings ../../indexes/non-faiss-nfcorpus/queries \ encoder --encoder BAAI/bge-base-en-v1.5 --l2-norm \ --device cpu \ --pooling mean \ - --fields title text \ --batch 32 ``` diff --git a/scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh b/scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh old mode 100644 new mode 100755 diff --git a/scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py b/scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py index 092082529..fdfdbfd71 100644 --- a/scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py +++ b/scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py @@ -18,16 +18,18 @@ def insert_data_into_table(cur, id, content, vector): """Inserts data into the PostgreSQL table.""" cur.execute("INSERT INTO documents (id, content, vector) VALUES (%s, %s, %s)", (id, content, vector)) -def setup_database(): +def setup_database(config): """Sets up the PostgreSQL database and inserts document 
data.""" + conn = psycopg2.connect( - dbname='main_database', - user='mainuser', - password='password', - host='localhost', - port='5432' + dbname=config['dbname'], + user=config['user'], + password=config['password'], + host=config['host'], + port=config['port'] ) cur = conn.cursor() + cur.execute("DROP TABLE IF EXISTS documents;") # Create documents table cur.execute(f""" @@ -46,6 +48,10 @@ def setup_database(): insert_data_into_table(cur, data['id'], data['contents'], data['vector']) conn.commit() + cur.execute("DROP INDEX IF EXISTS documents_vector_ip_ops_idx;") + cur.execute("DROP INDEX IF EXISTS documents_vector_l2_ops_idx;") + cur.execute("DROP INDEX IF EXISTS documents_vector_cosine_ops_idx;") + # Create indexes with pgvector start_time = time.time() cur.execute("CREATE INDEX ON documents USING HNSW (vector vector_l2_ops);") @@ -116,7 +122,12 @@ def run_benchmark(cur, trec_output_file_path, metric): return total_time, mean_time, variance_time, min_time, max_time if __name__ == "__main__": - cur, conn = setup_database() + # parse the db_config_file + with open('pgvector_db_config.txt', 'r') as f: + db_config = f.readlines() + DBConfig = {line.strip().split(':')[0]: line.strip().split(':')[1] for line in db_config} + + cur, conn = setup_database(DBConfig) # Running the benchmarks print('l2sq: ', run_benchmark(cur, TREC_L2SQ_OUTPUT_FILE_PATH, 'l2sq')) diff --git a/scripts/vectordb_benchmark/benchmarking.sh b/scripts/vectordb_benchmark/benchmarking.sh new file mode 100755 index 000000000..1c6bee549 --- /dev/null +++ b/scripts/vectordb_benchmark/benchmarking.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Create and start a screen session named "benchmarking" +screen -dmS benchmarking +screen -S benchmarking -X stuff "conda activate pyserini\n" + +# Run the benchmark.py script in the background within the "benchmarking" session +screen -S benchmarking -X stuff "./benchmark_msmarco_duckdb.sh\n" + +# Detach from the screen session and return to the main terminal +screen -d 
benchmarking + +echo "Benchmarking script is running in the background in the 'benchmarking' screen session." +echo "You can reattach to the session later with the command: screen -r benchmarking" diff --git a/scripts/vectordb_benchmark/init_and_start_postgres.sh b/scripts/vectordb_benchmark/init_and_start_postgres.sh new file mode 100755 index 000000000..b54d0640c --- /dev/null +++ b/scripts/vectordb_benchmark/init_and_start_postgres.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Check if data directory is provided +if [ $# -eq 0 ]; then + echo "Please provide the data directory path as an argument." + exit 1 +fi + +DATA_DIR=$1 + +# Create data directory if it doesn't exist +if [ ! -d "$DATA_DIR" ]; then + mkdir -p "$DATA_DIR" + echo "Created data directory: $DATA_DIR" +fi + +# Initialize PostgreSQL database +initdb -D "$DATA_DIR" + +# Start PostgreSQL server +pg_ctl -D "$DATA_DIR" -l logfile start + +# Wait for the server to start +sleep 5 + +# Connect to PostgreSQL and run commands +psql -d postgres -f setup_db.sql + +echo "Database initialization and setup completed." + +# Note: Keep the server running. 
To stop it later, use: +# pg_ctl -D "$DATA_DIR" stop diff --git a/scripts/vectordb_benchmark/pgvector_db_config.txt b/scripts/vectordb_benchmark/pgvector_db_config.txt index 2f28f9ffb..acf953f72 100644 --- a/scripts/vectordb_benchmark/pgvector_db_config.txt +++ b/scripts/vectordb_benchmark/pgvector_db_config.txt @@ -1,5 +1,5 @@ -username:main_user +user:main_user password:123456 host:localhost port:5432 -database:main_database \ No newline at end of file +dbname:main_database \ No newline at end of file diff --git a/scripts/vectordb_benchmark/pgvector_faiss_index_adaptor.py b/scripts/vectordb_benchmark/pgvector_faiss_index_adaptor.py index c7c066eeb..64cd02f15 100644 --- a/scripts/vectordb_benchmark/pgvector_faiss_index_adaptor.py +++ b/scripts/vectordb_benchmark/pgvector_faiss_index_adaptor.py @@ -19,7 +19,10 @@ def initialize_database_and_table(self, table_name, DBConfig, vector_size): port=DBConfig['port'] ) cur = conn.cursor() - + cur.execute(f"DROP TABLE IF EXISTS {table_name};") + cur.execute(f"DROP INDEX IF EXISTS {table_name}_vector_ip_ops_idx;") + cur.execute(f"DROP INDEX IF EXISTS {table_name}_vector_l2_ops_idx;") + cur.execute(f"DROP INDEX IF EXISTS {table_name}_vector_cosine_ops_idx;") # Create documents table cur.execute(f""" CREATE TABLE {table_name} ( diff --git a/scripts/vectordb_benchmark/run_benchmark.py b/scripts/vectordb_benchmark/run_benchmark.py index 8c1d4c49c..ffdfd46ae 100644 --- a/scripts/vectordb_benchmark/run_benchmark.py +++ b/scripts/vectordb_benchmark/run_benchmark.py @@ -29,11 +29,8 @@ elif args.db_type == 'pgvector': adaptor = pgvector_faiss_index_adaptor.PGVectorFaissIndexAdaptor(args.index_name, DBConfig) - if args.file_path: - run_benchmark_on_file(TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, args.metric, args.query_index_path, adaptor, args.table_name) - else: - adaptor.extract_vectors_and_construct_index(args.table_name, args.metric) - run_benchmark(TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, args.metric, args.query_index_path, adaptor, 
args.table_name) + adaptor.extract_vectors_and_construct_index(args.table_name, args.metric) + run_benchmark(TREC_DOT_PRODUCT_OUTPUT_FILE_PATH, args.metric, args.query_index_path, adaptor, args.table_name) diff --git a/scripts/vectordb_benchmark/run_sql.sh b/scripts/vectordb_benchmark/run_sql.sh new file mode 100755 index 000000000..a134cdc1a --- /dev/null +++ b/scripts/vectordb_benchmark/run_sql.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Check if all required arguments are provided +if [ "$#" -ne 3 ]; then + echo "Usage: $0 <sql_script> <username> <database>" + exit 1 +fi + +# Assign input arguments to variables +SQL_SCRIPT=$1 +USERNAME=$2 +DATABASE=$3 + +# Check if the SQL script file exists +if [ ! -f "$SQL_SCRIPT" ]; then + echo "Error: SQL script file '$SQL_SCRIPT' not found." + exit 1 +fi + +# Execute the SQL script using psql +psql -U "$USERNAME" -d "$DATABASE" -f "$SQL_SCRIPT" -v ON_ERROR_STOP=1 --echo-all + +# Check the exit status of psql +if [ $? -eq 0 ]; then + echo "SQL script executed successfully." +else + echo "Error: SQL script execution failed."
+ exit 1 +fi \ No newline at end of file diff --git a/scripts/vectordb_benchmark/setup_db.sql b/scripts/vectordb_benchmark/setup_db.sql new file mode 100644 index 000000000..074d872f0 --- /dev/null +++ b/scripts/vectordb_benchmark/setup_db.sql @@ -0,0 +1,9 @@ +CREATE ROLE main_user WITH LOGIN PASSWORD '123456'; +CREATE DATABASE main_database; +\connect main_database +GRANT ALL PRIVILEGES ON DATABASE main_database TO main_user; +GRANT USAGE ON SCHEMA public TO main_user; +GRANT CREATE ON SCHEMA public TO main_user; +CREATE EXTENSION vector; + + From df29a3161413ac6a084718f8977034f158dd11e9 Mon Sep 17 00:00:00 2001 From: song Date: Sun, 29 Sep 2024 15:34:53 -0400 Subject: [PATCH 20/21] Updated the instructions doc to contain step-by-step guide --- docs/experiment-vectordbs.md | 117 +++++++++++------- .../benchmark_msmarco_duckdb.sh | 2 +- .../benchmark_msmarco_pgvector.sh | 2 +- .../benchmark_nfcorpus_duckdb.py | 10 +- .../benchmark_nfcorpus_pgvector.py | 10 +- scripts/vectordb_benchmark/benchmarking.sh | 2 +- .../init_and_start_postgres.sh | 1 - 7 files changed, 88 insertions(+), 56 deletions(-) diff --git a/docs/experiment-vectordbs.md b/docs/experiment-vectordbs.md index fbf4d8670..328829831 100644 --- a/docs/experiment-vectordbs.md +++ b/docs/experiment-vectordbs.md @@ -1,8 +1,44 @@ # Overview -This document contains instructions for setting up and running benchmarks for querying MSMarco and NFCorpus using DuckDB and PGVector. +We are going to run benchmarks for MSMarco and NFCorpus using DuckDB and PGVector on HNSW indexes. -# Prerequisites -- Pyserini Setup
+ +```bash +mkdir collections/msmarco-passage + +wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz -P collections/msmarco-passage + +# Alternative mirror: +# wget https://www.dropbox.com/s/9f54jg2f71ray3b/collectionandqueries.tar.gz -P collections/msmarco-passage + +tar xvfz collections/msmarco-passage/collectionandqueries.tar.gz -C collections/msmarco-passage +``` + +Next, we need to convert the MS MARCO tsv queries into Pyserini's jsonl files (which have one json object per line): + +```bash +python tools/scripts/msmarco/convert_collection_to_jsonl.py \ + --collection-path collections/msmarco-passage/queries.dev.small.tsv \ + --output-folder collections/msmarco-passage/queries_jsonl +``` + +Now, we need to convert the jsonl queries into a faiss index. + +```bash +python -m pyserini.encode \ + input --corpus collections/msmarco-passage/queries_jsonl \ + output --embeddings collections/msmarco-passage/queries_faiss \ + --to-faiss \ + encoder --encoder BAAI/bge-base-en-v1.5 --l2-norm \ + --device cpu \ + --pooling mean \ + --batch 32 +``` + +Now, after the data is prepared, we can run the benchmark on DuckDB. # Database setup First, activate a Conda environment, for example your pyserini environment. @@ -11,10 +47,9 @@ First, activate a Conda environment, for example your pyserini environment. conda activate pyserini ``` ## DuckDB -Duckdb is relatively easy to set up, as it is an in-memory database that can be embedded into a process. Therefore, a simple -`pip install duckdb` will suffice. Then, you should supply a config file, `duckdb_db_config.txt`, to specify the database configuration. The only parameter you need to tune is how much memory you want to allocate to the database. +Duckdb is relatively easy to set up, as it is an in-memory database that can be embedded into a process. 
Therefore, you only need to install this database via the command line: ``` -memory_limit:100GB +pip install duckdb ``` Then, you can simply run the following, to run the benchmark. @@ -23,35 +58,26 @@ $ python3 vectordb_benchmark/run_benchmark.py \ --index_name='msmarco-v1-passage.bge-base-en-v1.5' \ --table_name='msmarco' \ --metric='ip' \ - --query_index_path='pyserini/indexes/msmarco-dev.bge-base-en-v1.5' \ + --query_index_path='collections/msmarco-passage/queries_faiss' \ --db_type='duckdb' \ - --db_config_file='duckdb_db_config.txt' + --db_config_file='./scripts/vectordb_benchmark/duckdb_db_config.txt' ``` -The entire process may take over a day to complete, depending on your hardware set up. This code will download the index, extract the embedded vectors of the index, build the table in duckdb and run the benchmark. - -# PGVector -PGVector is an extension of PostgreSQL, so you will need to install both PostgreSQL and PGVector. +The db_config_file should be a text file; it specifies how much memory you would allow DuckDB to allocate. You can modify this file if you want; by default the memory limit is 100GB. The entire process may take over a day to complete, depending on your hardware set up. This code will download the index, extract the embedded vectors of the index, build the table in duckdb and run the benchmark. Alternatively, you can run the script `./scripts/vectordb_benchmark/benchmark_msmarco_duckdb.sh` to run the benchmark. -## PostgreSQL and pgvector Installation Guide (Conda) +## PGVector +Now that we have finished the DuckDB experiment, we can run the same experiment on PGVector. PGVector is an extension of PostgreSQL, so you will need to install both PostgreSQL and PGVector for this experiment. -This guide provides step-by-step instructions on how to install PostgreSQL using Conda and manually install the `pgvector` extension. By following these instructions, you'll be able to set up PostgreSQL and create the `pgvector` extension successfully.
- -### Install PostgreSQL +# Install PostgreSQL ```bash conda install -c conda-forge postgresql ``` -### Initialize and start the database -```bash -initdb -D /path/to/your/database_directory -pg_ctl -D /path/to/your/database_directory start -``` - -### Install and activate PGVector +# Install PGVector To manually install pgvector, first install the necessary build tools (gcc and make) using Conda: ```bash conda install -c conda-forge gcc_linux-64 make ``` +Then, you can clone the pgvector repository, and make and install the extension. ```bash git clone https://github.com/pgvector/pgvector.git @@ -67,13 +93,14 @@ ls $(pg_config --pkglibdir)/vector.so ``` If both files are present, the installation was successful. -Now, you have to initialize the database, create the vector extension and create a user and database for your experiment. The script -`vectordb_benchmark/init_and_start_postgres.sh` will do this for you. It will ask you for a directory for the database data, and then it will create a database -called `main_database` and a user called `main_user`, and enable the vector extension, so you can simply run: - +# Start Database Server +Now that installations are done, you can finally initialize the database, create the vector extension, create a user and database for your experiment and start your postgresql server. The script `vectordb_benchmark/init_and_start_postgres.sh` will do all of these for you, it will initialize the database, create a database called `main_database` and a user called `main_user`, and enable the vector extension. Therefore, you can simply run: ```bash -./init_and_start_postgres.sh +./init_and_start_postgres.sh ~/pgdata ``` +and your postgresql server will be up and running on port 5432. The only argument is the directory for the database data, you can modify this if you want. + +# Run the Benchmark Now that you have the PGVector extension installed and enabled in PostgreSQL. 
You can start running the benchmark, but first, make sure you supply the correct database configuration in the `pgvector_db_config.txt` file. For example, by default: ``` @@ -91,27 +118,33 @@ $ python3 vectordb_benchmark/run_benchmark.py \ --index_name='msmarco-v1-passage.bge-base-en-v1.5' \ --table_name='msmarco' \ --metric='ip' \ - --query_index_path='pyserini/indexes/msmarco-dev.bge-base-en-v1.5' \ + --query_index_path='collections/msmarco-passage/queries_faiss' \ --db_type='pgvector' \ - --db_config_file='pgvector_db_config.txt' \ + --db_config_file='./scripts/vectordb_benchmark/pgvector_db_config.txt' \ ``` -or simply run the script `benchmark_msmarco_pgvector.sh` +or simply run the script `./scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh` Note that after one run, your postgresql will contain the table data, the current behaviour is to drop the table and index if they exist when the benchmark started. Later, we will add an option to skip table creation and index building, so that you can run the benchmark multiple times without having to re-create the table and index every time. -# Encoding and Benchmarking NFCorpus using DuckDB and PGVector +# NFCorpus -This document contains instructions for encoding and benchmarking NFCorpus using DuckDB and PGVector. +## Data Prep +Similar to the onboarding docs, we must first download the NFCorpus Dataset. + +```bash +wget https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/nfcorpus.zip -P collections +unzip collections/nfcorpus.zip -d collections +``` ## 1. Encode the Corpus Create a directory for document embeddings and encode the corpus using the specified encoder. 
```bash -mkdir ../../indexes/non-faiss-nfcorpus -mkdir ../../indexes/non-faiss-nfcorpus/documents +mkdir indexes/faiss-nfcorpus +mkdir indexes/faiss-nfcorpus/documents python -m pyserini.encode \ - input --corpus ../../collections/nfcorpus/corpus.jsonl \ - output --embeddings ../../indexes/non-faiss-nfcorpus/documents \ + input --corpus collections/nfcorpus/corpus.jsonl \ + output --embeddings indexes/faiss-nfcorpus/documents \ encoder --encoder BAAI/bge-base-en-v1.5 --l2-norm \ --device cpu \ --pooling mean \ @@ -122,10 +155,10 @@ python -m pyserini.encode \ Create a directory for query embeddings and encode the queries using the specified encoder. ```bash -mkdir ../../indexes/non-faiss-nfcorpus/queries +mkdir indexes/faiss-nfcorpus/queries python -m pyserini.encode \ - input --corpus ../../collections/nfcorpus/queries.jsonl \ - output --embeddings ../../indexes/non-faiss-nfcorpus/queries \ + input --corpus collections/nfcorpus/queries.jsonl \ + output --embeddings indexes/faiss-nfcorpus/queries \ encoder --encoder BAAI/bge-base-en-v1.5 --l2-norm \ --device cpu \ --pooling mean \ @@ -135,5 +168,5 @@ python -m pyserini.encode \ ## 3. 
Run Benchmarks ```bash -python3 benchmark_nfcorpus_duckdb.py -python3 benchmark_nfcorpus_pgvector.py +python3 ./scripts/vectordb_benchmark/benchmark_nfcorpus_duckdb.py +python3 ./scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py diff --git a/scripts/vectordb_benchmark/benchmark_msmarco_duckdb.sh b/scripts/vectordb_benchmark/benchmark_msmarco_duckdb.sh index af1c565e9..ffdcf2b78 100755 --- a/scripts/vectordb_benchmark/benchmark_msmarco_duckdb.sh +++ b/scripts/vectordb_benchmark/benchmark_msmarco_duckdb.sh @@ -2,6 +2,6 @@ python3 ./run_benchmark.py \ --index_name='msmarco-v1-passage.bge-base-en-v1.5' \ --table_name='msmarco' \ --metric='ip' \ ---query_index_path='../../indexes/msmarco-dev.bge-base-en-v1.5' \ +--query_index_path='../../collections/msmarco-passage/queries_faiss' \ --db_type='duckdb' \ --db_config_file='duckdb_db_config.txt' diff --git a/scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh b/scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh index aae74197c..a229afa3e 100755 --- a/scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh +++ b/scripts/vectordb_benchmark/benchmark_msmarco_pgvector.sh @@ -2,6 +2,6 @@ python3 ./run_benchmark.py \ --index_name='msmarco-v1-passage.bge-base-en-v1.5' \ --table_name='msmarco' \ --metric='ip' \ ---query_index_path='../../indexes/msmarco-dev.bge-base-en-v1.5' \ +--query_index_path='../../collections/msmarco-passage/queries_faiss' \ --db_type='pgvector' \ --db_config_file='pgvector_db_config.txt' \ No newline at end of file diff --git a/scripts/vectordb_benchmark/benchmark_nfcorpus_duckdb.py b/scripts/vectordb_benchmark/benchmark_nfcorpus_duckdb.py index 5fd893835..694a75185 100644 --- a/scripts/vectordb_benchmark/benchmark_nfcorpus_duckdb.py +++ b/scripts/vectordb_benchmark/benchmark_nfcorpus_duckdb.py @@ -5,11 +5,11 @@ import time # Paths to embedding, query, and output files -DOCUMENT_JSONL_FILE_PATH = '../../indexes/non-faiss-nfcorpus/documents/embeddings.jsonl' -QUERY_JSONL_FILE_PATH = 
'../../indexes/non-faiss-nfcorpus/queries/embeddings.jsonl' -TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = '../../runs/.run-non-faiss-nfcorpus-result_dot_product.txt' -TREC_COSINE_OUTPUT_FILE_PATH = '../../runs/.run-non-faiss-nfcorpus-result_cosine.txt' -TREC_L2SQ_OUTPUT_FILE_PATH = '../../runs/.run-non-faiss-nfcorpus-result_l2sq.txt' +DOCUMENT_JSONL_FILE_PATH = '../../indexes/faiss-nfcorpus/documents/embeddings.jsonl' +QUERY_JSONL_FILE_PATH = '../../indexes/faiss-nfcorpus/queries/embeddings.jsonl' +TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = '../../runs/.run-faiss-nfcorpus-result_dot_product.txt' +TREC_COSINE_OUTPUT_FILE_PATH = '../../runs/.run-faiss-nfcorpus-result_cosine.txt' +TREC_L2SQ_OUTPUT_FILE_PATH = '../../runs/.run-faiss-nfcorpus-result_l2sq.txt' K = 10 # Number of nearest neighbors to retrieve RUN_ID = "DuckDBHNSW" # Identifier for the run diff --git a/scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py b/scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py index fdfdbfd71..8001489ab 100644 --- a/scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py +++ b/scripts/vectordb_benchmark/benchmark_nfcorpus_pgvector.py @@ -5,11 +5,11 @@ import time # Paths to embedding, query, and output files -DOCUMENT_JSONL_FILE_PATH = '../../indexes/non-faiss-nfcorpus/documents/embeddings.jsonl' -QUERY_JSONL_FILE_PATH = '../../indexes/non-faiss-nfcorpus/queries/embeddings.jsonl' -TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = '../../runs/.run-non-faiss-nfcorpus-result_dot_product.txt' -TREC_COSINE_OUTPUT_FILE_PATH = '../../runs/.run-non-faiss-nfcorpus-result_cosine.txt' -TREC_L2SQ_OUTPUT_FILE_PATH = '../../runs/.run-non-faiss-nfcorpus-result_l2sq.txt' +DOCUMENT_JSONL_FILE_PATH = '../../indexes/faiss-nfcorpus/documents/embeddings.jsonl' +QUERY_JSONL_FILE_PATH = '../../indexes/faiss-nfcorpus/queries/embeddings.jsonl' +TREC_DOT_PRODUCT_OUTPUT_FILE_PATH = '../../runs/.run-faiss-nfcorpus-result_dot_product.txt' +TREC_COSINE_OUTPUT_FILE_PATH = '../../runs/.run-faiss-nfcorpus-result_cosine.txt' 
+TREC_L2SQ_OUTPUT_FILE_PATH = '../../runs/.run-faiss-nfcorpus-result_l2sq.txt' VECTOR_SIZE = 768 K = 10 # Number of nearest neighbors to retrieve RUN_ID = "PostgresHNSW" diff --git a/scripts/vectordb_benchmark/benchmarking.sh b/scripts/vectordb_benchmark/benchmarking.sh index 1c6bee549..ff43ac9bc 100755 --- a/scripts/vectordb_benchmark/benchmarking.sh +++ b/scripts/vectordb_benchmark/benchmarking.sh @@ -5,7 +5,7 @@ screen -dmS benchmarking screen -S benchmarking -X stuff "conda activate pyserini\n" # Run the benchmark.py script in the background within the "benchmarking" session -screen -S benchmarking -X stuff "./benchmark_msmarco_duckdb.sh\n" +screen -S benchmarking -X stuff "./benchmark_msmarco_pgvector.sh\n" # Detach from the screen session and return to the main terminal screen -d benchmarking diff --git a/scripts/vectordb_benchmark/init_and_start_postgres.sh b/scripts/vectordb_benchmark/init_and_start_postgres.sh index b54d0640c..ab41cb7f7 100755 --- a/scripts/vectordb_benchmark/init_and_start_postgres.sh +++ b/scripts/vectordb_benchmark/init_and_start_postgres.sh @@ -1,5 +1,4 @@ #!/bin/bash - # Check if data directory is provided if [ $# -eq 0 ]; then echo "Please provide the data directory path as an argument." 
From f455679f618414de5cf9befaf250036e571259ef Mon Sep 17 00:00:00 2001 From: song Date: Sun, 29 Sep 2024 16:03:16 -0400 Subject: [PATCH 21/21] added instructions on where to find the result --- docs/experiment-vectordbs.md | 3 +++ scripts/vectordb_benchmark/faiss_index_adaptor.py | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/experiment-vectordbs.md b/docs/experiment-vectordbs.md index 328829831..e40c867aa 100644 --- a/docs/experiment-vectordbs.md +++ b/docs/experiment-vectordbs.md @@ -126,6 +126,9 @@ or simply run the script `./scripts/vectordb_benchmark/benchmark_msmarco_pgvecto Note that after one run, your postgresql will contain the table data, the current behaviour is to drop the table and index if they exist when the benchmark started. +# Results +To view the output of the benchmark, you can check the `msmarco_benchmark_results.txt` file in the `scripts/vectordb_benchmark` folder. It contains the Total time, the mean, variance, min, max time to run a single query on the HNSW index built in the vectordb, as well as the actual ndcg@10 result and verbose output of the trec evaluation tool.
The raw trec evaluation output is in the file `trec_dot_product_output.txt` in the top level directory + # NFCorpus ## Data Prep diff --git a/scripts/vectordb_benchmark/faiss_index_adaptor.py b/scripts/vectordb_benchmark/faiss_index_adaptor.py index 460312e61..a440008bb 100644 --- a/scripts/vectordb_benchmark/faiss_index_adaptor.py +++ b/scripts/vectordb_benchmark/faiss_index_adaptor.py @@ -22,7 +22,8 @@ def extract_vectors_and_construct_index(self, table_name, metric, extract_all_ve startid = 0 batch_size = 100000 self.extractor.load_index() - while startid < self.extractor.index.ntotal: + # while startid < self.extractor.index.ntotal: + while startid < 100000: # time extraction start_time = time.time() self.vector_map = self.extractor.extract_one_batch_of_vectors(startid, batch_size) @@ -55,6 +56,8 @@ def run_trec_eval(self, trec_output_file_path): "../../collections/msmarco-passage/qrels.dev.small.trec", trec_output_file_path ] - return subprocess.run(command) + # Capture both stdout and stderr + result = subprocess.run(command, capture_output=True, text=True) + return result