Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions helix-container/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ async-trait = "0.1"
tracing-subscriber = "0.3.20"
tracing = "0.1.41"
dotenvy = "0.15.7"
half = { version = "2.6.0", features = ["std"] }

[features]
dev = ["helix-db/dev-instance"]
1 change: 1 addition & 0 deletions helix-db/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ tempfile = "3.20.0"
paste = "1.0.15"
rayon = "1.11.0"
mimalloc = "0.1.48"
half = { version = "2.4.1", features = ["serde", "std"] }

# compiler dependencies
pest = { version = "2.7", optional = true }
Expand Down
32 changes: 15 additions & 17 deletions helix-db/benches/hnsw_benches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,17 @@ mod tests {
use heed3::{Env, EnvOpenOptions, RoTxn};
use helix_db::{
helix_engine::vector_core::{
VectorData,
hnsw::HNSW,
vector::HVector,
vector_core::{HNSWConfig, VectorCore},
},
utils::tqdm::tqdm,
};
use polars::prelude::*;
use rand::{
prelude::SliceRandom,
Rng,
};
use rand::{Rng, prelude::SliceRandom};
use std::{
collections::{HashSet, HashMap},
collections::{HashMap, HashSet},
fs::{self, File},
sync::{Arc, Mutex},
thread,
Expand Down Expand Up @@ -78,7 +76,7 @@ mod tests {
let local_results: HashMap<usize, Vec<u128>> = chunk
.into_iter()
.map(|(query_id, query_vec)| {
let query_hvector = HVector::from_slice(0, query_vec);
let query_hvector = HVector::from_slice(0, VectorData::F64(query_vec));

let mut distances: Vec<(u128, f64)> = base_vectors
.iter()
Expand All @@ -88,26 +86,23 @@ mod tests {
.map(|dist| (base_vec.id.clone(), dist))
.ok()
})
.collect();
.collect();

distances.sort_by(|a, b| {
a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)
});

let top_k_ids: Vec<u128> = distances
.into_iter()
.take(k)
.map(|(id, _)| id)
.collect();
let top_k_ids: Vec<u128> =
distances.into_iter().take(k).map(|(id, _)| id).collect();

(query_id, top_k_ids)
})
.collect();
.collect();

results.lock().unwrap().extend(local_results);
})
})
.collect();
.collect();

for handle in handles {
handle.join().unwrap();
Expand Down Expand Up @@ -319,7 +314,9 @@ mod tests {
let over_all_time = Instant::now();
for (i, data) in base_vectors.iter().enumerate() {
let start_time = Instant::now();
let vec = index.insert::<Filter>(&mut txn, &data, None).unwrap();
let vec = index
.insert::<Filter>(&mut txn, VectorData::F64(data.clone()), None)
.unwrap();
let time = start_time.elapsed();
base_all_vectors.push(vec);
//println!("{} => inserting in {} ms", i, time.as_millis());
Expand Down Expand Up @@ -354,7 +351,9 @@ mod tests {
let mut total_search_time = std::time::Duration::from_secs(0);
for (qid, query) in query_vectors.iter() {
let start_time = Instant::now();
let results = index.search::<Filter>(&txn, query, k, "vector", None, false).unwrap();
let results = index
.search::<Filter>(&txn, &VectorData::F64(query.clone()), k, "vector", None, false)
.unwrap();
let search_duration = start_time.elapsed();
total_search_time += search_duration;

Expand Down Expand Up @@ -400,4 +399,3 @@ mod tests {
}

// TODO: add a memory benchmark (HNSW index only)

9 changes: 7 additions & 2 deletions helix-db/src/grammar.pest
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,14 @@ source = { SOI ~ (schema_def | migration_def | query_def)* ~ EOI }
// Schema definitions
// ---------------------------------------------------------------------
schema_def = {( schema_version ~ "{" ~ (vector_def | node_def | edge_def)* ~ "}") | (vector_def | node_def | edge_def) }
vector_def = { "V::" ~ identifier_upper ~ node_body? }
vector_def = { "V::" ~ identifier_upper ~ precision? ~ node_body? }
node_def = { "N::" ~ identifier_upper ~ node_body? }
edge_def = { "E::" ~ identifier_upper ~ edge_body }
// Optional precision annotation for a vector definition: one precision tag
// wrapped in angle brackets. It is referenced as `precision?` in `vector_def`,
// where it follows the vector's name and precedes the optional body.
precision = { "<" ~ (f64 | f32 | f16) ~ ">" }
// Precision tags accepted inside `precision`; each rule matches only its
// upper-case keyword literal.
f64 = { "F64" }
f32 = { "F32" }
f16 = { "F16" }


node_body = { "{" ~ field_defs ~ "}" }
edge_body = { "{" ~ "From:" ~ identifier_upper ~ "," ~ ("To:" ~ identifier_upper ~ "," ~ properties ~ "}" | "To:" ~ identifier_upper ~ ","? ~ "}") }
Expand Down Expand Up @@ -264,7 +269,7 @@ id_arg = { (identifier | string_literal) }
id_args = { (id_arg) ~ ("," ~ id_arg)* }
array = { "[" ~ param_type ~ "]" }
object = { "{" ~ field_defs ~ "}" }
named_type = { "String" | "Boolean" | "F32" | "F64" | "I8" | "I16" | "I32" | "I64" | "U8" | "U16" | "U32" | "U64" | "U128" }
named_type = { "String" | "Boolean" | "F16" | "F32" | "F64" | "I8" | "I16" | "I32" | "I64" | "U8" | "U16" | "U32" | "U64" | "U128" }
ID_TYPE = { "ID" }
date_type = { "Date" }
param_type = { named_type | date_type | ID_TYPE | array | object | identifier }
Expand Down
60 changes: 32 additions & 28 deletions helix-db/src/helix_engine/bm25/bm25.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
use crate::{
debug_println,
helix_engine::{
storage_core::HelixGraphStorage,
types::GraphError,
vector_core::{hnsw::HNSW, vector::HVector},
vector_core::{VectorData, hnsw::HNSW, vector::HVector},
},
protocol::value::Value,
debug_println,
};

use heed3::{types::*, Database, Env, RoTxn, RwTxn};
use heed3::{Database, Env, RoTxn, RwTxn, types::*};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tokio::task;
Expand Down Expand Up @@ -82,10 +82,10 @@ impl HBM25Config {

let doc_lengths_db: Database<U128<heed3::byteorder::BE>, U32<heed3::byteorder::BE>> =
graph_env
.database_options()
.types::<U128<heed3::byteorder::BE>, U32<heed3::byteorder::BE>>()
.name(DB_BM25_DOC_LENGTHS)
.create(wtxn)?;
.database_options()
.types::<U128<heed3::byteorder::BE>, U32<heed3::byteorder::BE>>()
.name(DB_BM25_DOC_LENGTHS)
.create(wtxn)?;

let term_frequencies_db: Database<Bytes, U32<heed3::byteorder::BE>> = graph_env
.database_options()
Expand All @@ -110,7 +110,11 @@ impl HBM25Config {
})
}

pub fn new_temp(graph_env: &Env, wtxn: &mut RwTxn, uuid: &str) -> Result<HBM25Config, GraphError> {
pub fn new_temp(
graph_env: &Env,
wtxn: &mut RwTxn,
uuid: &str,
) -> Result<HBM25Config, GraphError> {
let inverted_index_db: Database<Bytes, Bytes> = graph_env
.database_options()
.types::<Bytes, Bytes>()
Expand All @@ -120,10 +124,10 @@ impl HBM25Config {

let doc_lengths_db: Database<U128<heed3::byteorder::BE>, U32<heed3::byteorder::BE>> =
graph_env
.database_options()
.types::<U128<heed3::byteorder::BE>, U32<heed3::byteorder::BE>>()
.name(format!("{DB_BM25_DOC_LENGTHS}_{uuid}").as_str())
.create(wtxn)?;
.database_options()
.types::<U128<heed3::byteorder::BE>, U32<heed3::byteorder::BE>>()
.name(format!("{DB_BM25_DOC_LENGTHS}_{uuid}").as_str())
.create(wtxn)?;

let term_frequencies_db: Database<Bytes, U32<heed3::byteorder::BE>> = graph_env
.database_options()
Expand Down Expand Up @@ -188,7 +192,7 @@ impl BM25 for HBM25Config {
let current_df = self.term_frequencies_db.get(txn, term_bytes)?.unwrap_or(0);
self.term_frequencies_db
.put(txn, term_bytes, &(current_df + 1))?;
}
}

let mut metadata = if let Some(data) = self.metadata_db.get(txn, METADATA_KEY)? {
bincode::deserialize::<BM25Metadata>(data)?
Expand Down Expand Up @@ -400,7 +404,7 @@ impl HybridSearch for HelixGraphStorage {
limit: usize,
) -> Result<Vec<(u128, f32)>, GraphError> {
let query_owned = query.to_string();
let query_vector_owned = query_vector.to_vec();
let query_vector_owned = VectorData::from_f64_slice(query_vector);

let graph_env_bm25 = self.graph_env.clone();
let graph_env_vector = self.graph_env.clone();
Expand All @@ -413,18 +417,19 @@ impl HybridSearch for HelixGraphStorage {
}
});

let vector_handle = task::spawn_blocking(move || -> Result<Option<Vec<HVector>>, GraphError> {
let txn = graph_env_vector.read_txn()?;
let results = self.vectors.search::<fn(&HVector, &RoTxn) -> bool>(
&txn,
&query_vector_owned,
limit * 2,
"vector",
None,
false,
)?;
Ok(Some(results))
});
let vector_handle =
task::spawn_blocking(move || -> Result<Option<Vec<HVector>>, GraphError> {
let txn = graph_env_vector.read_txn()?;
let results = self.vectors.search::<fn(&HVector, &RoTxn) -> bool>(
&txn,
&query_vector_owned,
limit * 2,
"vector",
None,
false,
)?;
Ok(Some(results))
});

let (bm25_results, vector_results) = match tokio::try_join!(bm25_handle, vector_handle) {
Ok((a, b)) => (a, b),
Expand All @@ -447,7 +452,7 @@ impl HybridSearch for HelixGraphStorage {
.entry(doc_id)
.and_modify(|existing_score| *existing_score += (1.0 - alpha) * similarity)
.or_insert((1.0 - alpha) * similarity); // correction made here from score as f32 to similarity
}
}
}

let mut results = combined_scores.into_iter().collect::<Vec<(u128, f32)>>();
Expand Down Expand Up @@ -475,4 +480,3 @@ impl BM25Flatten for HashMap<String, Value> {
})
}
}

26 changes: 16 additions & 10 deletions helix-db/src/helix_engine/bm25/bm25_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ mod tests {
},
storage_core::{HelixGraphStorage, version_info::VersionInfo},
traversal_core::config::Config,
vector_core::{hnsw::HNSW, vector::HVector},
vector_core::{VectorData, hnsw::HNSW, vector::HVector},
},
protocol::value::Value,
};
Expand Down Expand Up @@ -1424,9 +1424,11 @@ mod tests {
let mut wtxn = storage.graph_env.write_txn().unwrap();
let vectors = generate_random_vectors(800, 650);
for vec in vectors {
let _ = storage
.vectors
.insert::<fn(&HVector, &RoTxn) -> bool>(&mut wtxn, &vec, None);
let _ = storage.vectors.insert::<fn(&HVector, &RoTxn) -> bool>(
&mut wtxn,
VectorData::F64(vec),
None,
);
}
wtxn.commit().unwrap();

Expand Down Expand Up @@ -1466,9 +1468,11 @@ mod tests {
let mut wtxn = storage.graph_env.write_txn().unwrap();
let vectors = generate_random_vectors(800, 650);
for vec in vectors {
let _ = storage
.vectors
.insert::<fn(&HVector, &RoTxn) -> bool>(&mut wtxn, &vec, None);
let _ = storage.vectors.insert::<fn(&HVector, &RoTxn) -> bool>(
&mut wtxn,
VectorData::F64(vec),
None,
);
}
wtxn.commit().unwrap();

Expand Down Expand Up @@ -1509,9 +1513,11 @@ mod tests {
let mut wtxn = storage.graph_env.write_txn().unwrap();
let vectors = generate_random_vectors(800, 650);
for vec in vectors {
let _ = storage
.vectors
.insert::<fn(&HVector, &RoTxn) -> bool>(&mut wtxn, &vec, None);
let _ = storage.vectors.insert::<fn(&HVector, &RoTxn) -> bool>(
&mut wtxn,
VectorData::F64(vec),
None,
);
}
wtxn.commit().unwrap();

Expand Down
Loading
Loading