Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AtlasML: Add similarity measurement methods #43

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions AtlasMl/atlasml/ml/SimilarityMeasurement/Cosine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import numpy as np
from scipy.spatial.distance import cosine
from AtlasMl.atlasml.ml.VectorEmbeddings.ModelDimension import ModelDimension


def compute_cosine_similarity(embedding_vector, model: ModelDimension, comparison_vector):
    """
    Compute the cosine similarity between two embedding vectors.

    Parameters:
        embedding_vector (iterable): The first vector.
        model (ModelDimension): Embedding model the vectors belong to. It is
            not used by the computation; it is accepted so that all similarity
            functions share the same signature.
        comparison_vector (iterable): The second vector; must have the same
            shape as ``embedding_vector``.

    Returns:
        float: The cosine similarity of the two vectors. ``0.0`` is returned
        when either vector has no non-zero component, because the cosine is
        undefined there and would otherwise propagate as ``nan``.

    Raises:
        ValueError: If the vectors differ in shape or are not one-dimensional.
    """
    # Promote to float64 up front. Integer-typed arrays (e.g. int8/uint8
    # quantized embeddings) would otherwise keep their dtype and silently wrap
    # around inside the dot products, yielding a wrong similarity.
    emb1 = np.asarray(embedding_vector, dtype=np.float64)
    emb2 = np.asarray(comparison_vector, dtype=np.float64)

    # Ensure both vectors have the same shape
    if emb1.shape != emb2.shape:
        raise ValueError("Both vectors must have the same dimensions.")
    # TODO: Add vector re-shaper

    # Checked here so that the zero-vector shortcut below cannot mask
    # malformed (matrix-shaped) input.
    if emb1.ndim > 1:
        raise ValueError("Input vector should be 1-D.")

    # A vector without any non-zero entry has zero magnitude; dividing by it
    # would produce nan, so report "no similarity" instead.
    if not emb1.any() or not emb2.any():
        return 0.0

    # scipy returns the cosine *distance*; similarity is its complement.
    return float(1.0 - cosine(emb1, emb2))


# Sanity Check
# Guarded so that it only runs when this file is executed directly; importing
# the module no longer triggers a computation as a side effect.
if __name__ == "__main__":
    example_vec1 = [1, 2, 3]
    example_vec2 = [4, 5, 6]
    computed_similarity = compute_cosine_similarity(
        example_vec1, ModelDimension.text_embedding_three_small, example_vec2
    )
    print(f"Cosine similarity: {computed_similarity}")
35 changes: 35 additions & 0 deletions AtlasMl/atlasml/ml/SimilarityMeasurement/Euclidian.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import numpy as np
from scipy.spatial.distance import euclidean
from AtlasMl.atlasml.ml.VectorEmbeddings.ModelDimension import ModelDimension


def compute_euclidean_distance(embedding_vector, model: ModelDimension, comparison_vector):
    """
    Compute the Euclidean distance between two embedding vectors.

    Parameters:
        embedding_vector (iterable): The first vector.
        model (ModelDimension): Embedding model the vectors belong to. It is
            not used by the computation; it is accepted so that all similarity
            functions share the same signature.
        comparison_vector (iterable): The second vector; must have the same
            shape as ``embedding_vector``.

    Returns:
        float: The Euclidean (L2) distance between the two vectors.

    Raises:
        ValueError: If the two vectors do not have the same shape.
    """
    # Promote to float64 up front. With integer-typed arrays the element-wise
    # difference is computed in the input dtype and wraps around (for uint8,
    # 1 - 2 becomes 255), which silently corrupts the distance.
    vec1 = np.asarray(embedding_vector, dtype=np.float64)
    vec2 = np.asarray(comparison_vector, dtype=np.float64)

    # Ensure both vectors have the same shape
    if vec1.shape != vec2.shape:
        raise ValueError("Both vectors must have the same dimensions.")
    # TODO: Add vector re-shaper

    return float(euclidean(vec1, vec2))


# Sanity Check
# Guarded so that it only runs when this file is executed directly; importing
# the module no longer triggers a computation as a side effect.
if __name__ == "__main__":
    example_vec1 = [1, 2, 3]
    example_vec2 = [4, 5, 6]
    computed_distance = compute_euclidean_distance(
        example_vec1, ModelDimension.text_embedding_three_small, example_vec2
    )
    print(f"Euclidean distance: {computed_distance}")
36 changes: 36 additions & 0 deletions AtlasMl/atlasml/ml/SimilarityMeasurement/Jaccard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import numpy as np
from scipy.spatial.distance import jaccard
from AtlasMl.atlasml.ml.VectorEmbeddings.ModelDimension import ModelDimension


def compute_jaccard_similarity(embedding_vector, model: ModelDimension, comparison_vector):
    """
    Compute the Jaccard similarity between the non-zero patterns of two vectors.

    The Jaccard index is a set measure, so each vector is reduced to the set
    of positions holding a non-zero value; magnitudes and signs are ignored.
    The result is the number of positions that are non-zero in both vectors
    divided by the number that are non-zero in at least one of them.

    NOTE(review): dense real-valued embeddings rarely contain exact zeros, so
    this measure will usually be 1.0 for them — confirm that Jaccard is the
    intended metric for such inputs.

    Parameters:
        embedding_vector (iterable): The first vector.
        model (ModelDimension): Embedding model the vectors belong to. It is
            not used by the computation; it is accepted so that all similarity
            functions share the same signature.
        comparison_vector (iterable): The second vector; must have the same
            shape as ``embedding_vector``.

    Returns:
        float: The Jaccard similarity, i.e. one minus the Jaccard distance.

    Raises:
        ValueError: If the two vectors do not have the same shape.
    """
    # scipy's jaccard is a dissimilarity between boolean vectors. Binarizing
    # explicitly pins down the semantics instead of depending on how the
    # installed SciPy release happens to treat non-boolean input.
    vec1 = np.asarray(embedding_vector) != 0
    vec2 = np.asarray(comparison_vector) != 0

    # Ensure both vectors have the same shape
    if vec1.shape != vec2.shape:
        raise ValueError("Both vectors must have the same dimensions.")
    # TODO: Add vector re-shaper

    # Calculate Jaccard distance then convert to similarity
    distance = jaccard(vec1, vec2)
    return float(1.0 - distance)


# Sanity Check
# Guarded so that it only runs when this file is executed directly; importing
# the module no longer triggers a computation as a side effect.
if __name__ == "__main__":
    example_vec1 = [1, 2, 3]
    example_vec2 = [4, 5, 6]
    computed_similarity = compute_jaccard_similarity(
        example_vec1, ModelDimension.text_embedding_three_small, example_vec2
    )
    print(f"Jaccard similarity: {computed_similarity}")
Empty file.
6 changes: 6 additions & 0 deletions AtlasMl/atlasml/ml/VectorEmbeddings/ModelDimension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from enum import Enum

class ModelDimension(Enum):
    """Embedding models paired with the dimension of the vectors they produce.

    Each member is named after a model and its value is that model's vector
    dimension.
    """

    # NOTE(review): names presumably refer to OpenAI's text-embedding-3-small
    # and text-embedding-3-large models — confirm.
    text_embedding_three_small = 1536
    text_embedding_three_large = 3072
    # Add more models that are to be used and their vector dimensions
Loading