Skip to content

Commit

Permalink
- Add similarity measuring methods
Browse files Browse the repository at this point in the history
  • Loading branch information
ardakaraman0 committed Feb 17, 2025
1 parent ac7bdeb commit bb184d0
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 0 deletions.
35 changes: 35 additions & 0 deletions AtlasMl/atlasml/ml/SimilarityMeasurement/Cosine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import numpy as np
from scipy.spatial.distance import cosine
from AtlasMl.atlasml.ml.VectorEmbeddings.ModelDimension import ModelDimension


def compute_cosine_similarity(embedding_vector, model: "ModelDimension", comparison_vector):
    """
    Compute the cosine similarity between two embedding vectors.

    Parameters:
        embedding_vector (iterable): The first vector, a 1-D sequence of numbers.
        model (ModelDimension): Enum member describing the embedding model.
            Accepted for interface compatibility; it is not read or validated here.
        comparison_vector (iterable): The second vector, same shape as the first.

    Returns:
        float: 1 minus SciPy's cosine distance between the two vectors.
            0.0 when either vector has no non-zero entry, because the cosine
            is undefined for such a vector.

    Raises:
        ValueError: If the vectors differ in shape or are not one-dimensional.
    """
    # Cast to float once so integer/boolean inputs are handled uniformly.
    emb1 = np.asarray(embedding_vector, dtype=float)
    emb2 = np.asarray(comparison_vector, dtype=float)

    # Ensure both vectors have the same shape
    if emb1.shape != emb2.shape:
        raise ValueError("Both vectors must have the same dimensions.")
    if emb1.ndim != 1:
        raise ValueError("Both vectors must be one-dimensional.")
    # TODO: Add vector re-shaper

    # A vector without any non-zero entry has zero norm; dividing by it would
    # produce NaN, so report "no similarity" explicitly instead.
    if not emb1.any() or not emb2.any():
        return 0.0

    # SciPy returns a distance; similarity is its complement.
    return float(1.0 - cosine(emb1, emb2))


# Sanity check: executed whenever this module is imported.
# The sample vectors have three elements; the model argument is passed only to
# satisfy the signature. Despite its name, `computed_distance` receives the
# similarity returned by compute_cosine_similarity.
example_vec1, example_vec2 = [1, 2, 3], [4, 5, 6]
computed_distance = compute_cosine_similarity(
    example_vec1,
    ModelDimension.text_embedding_three_small,
    example_vec2,
)
35 changes: 35 additions & 0 deletions AtlasMl/atlasml/ml/SimilarityMeasurement/Euclidian.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import numpy as np
from scipy.spatial.distance import euclidean
from AtlasMl.atlasml.ml.VectorEmbeddings.ModelDimension import ModelDimension


def compute_euclidean_distance(embedding_vector, model: "ModelDimension", comparison_vector):
    """
    Compute the Euclidean distance between two embedding vectors.

    Parameters:
        embedding_vector (iterable): The first vector, a 1-D sequence of numbers.
        model (ModelDimension): Enum member describing the embedding model.
            Accepted for interface compatibility; it is not read or validated here.
        comparison_vector (iterable): The second vector, same shape as the first.

    Returns:
        float: The Euclidean (L2) distance between the two vectors.

    Raises:
        ValueError: If the vectors differ in shape or are not one-dimensional.
    """
    # Cast to float before subtracting: boolean arrays cannot be subtracted in
    # NumPy and unsigned-integer arrays would wrap around instead of going negative.
    vec1 = np.asarray(embedding_vector, dtype=float)
    vec2 = np.asarray(comparison_vector, dtype=float)

    # Ensure both vectors have the same shape
    if vec1.shape != vec2.shape:
        raise ValueError("Both vectors must have the same dimensions.")
    if vec1.ndim != 1:
        raise ValueError("Both vectors must be one-dimensional.")
    # TODO: Add vector re-shaper

    return float(euclidean(vec1, vec2))


# Sanity check: executed whenever this module is imported.
# The sample vectors have three elements; the model argument is passed only to
# satisfy the signature.
example_vec1, example_vec2 = [1, 2, 3], [4, 5, 6]
computed_distance = compute_euclidean_distance(
    example_vec1,
    ModelDimension.text_embedding_three_small,
    example_vec2,
)
36 changes: 36 additions & 0 deletions AtlasMl/atlasml/ml/SimilarityMeasurement/Jaccard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import numpy as np
from scipy.spatial.distance import jaccard
from AtlasMl.atlasml.ml.VectorEmbeddings.ModelDimension import ModelDimension


def compute_jaccard_similarity(embedding_vector, model: "ModelDimension", comparison_vector):
    """
    Compute the Jaccard similarity between two vectors.

    Each vector is reduced to a boolean mask of its non-zero positions, and the
    similarity is 1 minus SciPy's Jaccard distance between those masks. The
    magnitudes of the entries therefore do not influence the result.

    Parameters:
        embedding_vector (iterable): The first vector, a 1-D sequence of numbers.
        model (ModelDimension): Enum member describing the embedding model.
            Accepted for interface compatibility; it is not read or validated here.
        comparison_vector (iterable): The second vector, same shape as the first.

    Returns:
        float: The Jaccard similarity of the two non-zero masks.

    Raises:
        ValueError: If the vectors differ in shape or are not one-dimensional.
    """
    vec1 = np.asarray(embedding_vector)
    vec2 = np.asarray(comparison_vector)

    # Ensure both vectors have the same shape
    if vec1.shape != vec2.shape:
        raise ValueError("Both vectors must have the same dimensions.")
    if vec1.ndim != 1:
        raise ValueError("Both vectors must be one-dimensional.")
    # TODO: Add vector re-shaper

    # SciPy documents jaccard() for boolean vectors. Converting explicitly keeps
    # the result independent of how a given SciPy release coerces numeric input.
    # NOTE(review): confirm that non-zero-mask semantics are what callers expect
    # for dense embeddings, where nearly every entry is non-zero.
    mask1 = vec1 != 0
    mask2 = vec2 != 0

    # SciPy returns a distance; similarity is its complement.
    return float(1.0 - jaccard(mask1, mask2))


# Sanity check: executed whenever this module is imported.
# The sample vectors have three elements; the model argument is passed only to
# satisfy the signature. Despite its name, `computed_distance` receives the
# similarity returned by compute_jaccard_similarity.
example_vec1, example_vec2 = [1, 2, 3], [4, 5, 6]
computed_distance = compute_jaccard_similarity(
    example_vec1,
    ModelDimension.text_embedding_three_small,
    example_vec2,
)
Empty file.
6 changes: 6 additions & 0 deletions AtlasMl/atlasml/ml/VectorEmbeddings/ModelDimension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from enum import Enum

class ModelDimension(Enum):
    """Embedding models, each mapped to the dimension of its embedding vectors."""

    # NOTE(review): the names presumably refer to OpenAI's text-embedding-3-small
    # and text-embedding-3-large models — confirm against the embedding client.
    text_embedding_three_small = 1536
    text_embedding_three_large = 3072
    # Add more models that are to be used and their vector dimensions

0 comments on commit bb184d0

Please sign in to comment.