dataset utils

timmyjose-forks · May 25, 2024 · abdb2fc · abdb2fc
1 parent 49b8e2e
commit abdb2fc
Show file tree

Hide file tree

Showing 4 changed files with 1,146 additions and 0 deletions.
diff --git a/ch07/02_dataset-utilities/README.md b/ch07/02_dataset-utilities/README.md
@@ -0,0 +1,63 @@
+# Chapter 7: Instruction and Preference Finetuning
+
+This folder contains utility code that can be used for preparing an instruction dataset.
+
+
+
+### Finding near duplicates
+
+The `find-near-duplicates.py` script can be used to identify duplicates and near-duplicates in an instruction dataset. For example:
+
+
+
+```bash
+python find-near-duplicates.py --json_file instruction-examples.json
+```
+
+```
+
+
+==================================================
+ Searching 'instruction' for duplicates ...
+==================================================
+Duplicate pair found with similarity 0.85:
+1. Determine the state of matter for helium at room temperature.
+2. Determine the state of matter for nitrogen at room temperature.
+
+Duplicate pair found with similarity 0.98:
+1. Edit the following sentence to make it more formal.
+2. Edit the sentence to make it more formal.
+
+Duplicate pair found with similarity 1.00:
+1. Name a dwarf planet in our solar system.
+2. Name a dwarf planet in our solar system.
+
+Duplicate pair found with similarity 0.88:
+1. Change the sentences from active voice to passive voice.
+2. Change the sentence from passive to active voice.
+
+
+
+==================================================
+ Searching 'input' for duplicates ...
+==================================================
+Duplicate pair found with similarity 0.88:
+1. 
+2. She said, "I am tired."
+
+
+
+==================================================
+ Searching 'output' for duplicates ...
+==================================================
+Duplicate pair found with similarity 0.82:
+1. Helium is in a gaseous state at room temperature.
+2. Nitrogen is in a gaseous state at room temperature.
+
+Duplicate pair found with similarity 1.00:
+1. One dwarf planet in our solar system is Pluto.
+2. One dwarf planet in our solar system is Pluto.
+
+
+```
+
diff --git a/ch07/02_dataset-utilities/find-near-duplicates.py b/ch07/02_dataset-utilities/find-near-duplicates.py
@@ -0,0 +1,78 @@
+
+# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
+# Source for "Build a Large Language Model From Scratch"
+#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
+# Code: https://github.com/rasbt/LLMs-from-scratch
+
+import argparse
+import json
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+
+# Sample JSON dataset
+example_data = [
+    {"instruction": "What is the capital of Italy?", "input": "", "output": "The capital of Italy is Rome."},
+    {"instruction": "What's the capital city of Italy?", "input": "", "output": "The capital city is Rome."},
+    {"instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'", "input": "", "output": "The verb is 'sleeps'."},
+    {"instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.", "input": "", "output": "The verb in the sentence is \"sleeps.\""},
+    # Add other entries...
+]
+
+
+def find_near_duplicates(json_data, threshold=0.8, key="instruction"):
+    """The higher the threshold, the more similar the texts have to be to match"""
+
+    # Extract instructions
+    text = [item[key] for item in json_data if item[key]]
+    near_duplicates = []
+
+    if not text:
+        return near_duplicates
+
+    # Vectorize the text data
+    vectorizer = TfidfVectorizer(stop_words=None)
+    tfidf_matrix = vectorizer.fit_transform(text)
+
+    # Compute cosine similarity between each pair of entries
+    cos_sim_matrix = cosine_similarity(tfidf_matrix)
+
+    # Find pairs of near-duplicate instructions based on the threshold
+
+    for i in range(len(cos_sim_matrix)):
+        for j in range(i+1, len(cos_sim_matrix)):
+            if cos_sim_matrix[i, j] > threshold:
+                near_duplicates.append((json_data[i], json_data[j], cos_sim_matrix[i, j]))
+
+    return near_duplicates
+
+
+def find_and_print_new_duplicates(json_data):
+    for key in json_data[0].keys():
+        near_duplicates = find_near_duplicates(json_data, key=key)
+        print(f"\n\n{50*'='}\n Searching '{key}' for duplicates ...\n{50*'='}")
+        if not near_duplicates:
+            print("No duplicates found")
+        else:
+            for dup in near_duplicates:
+                print(f"Duplicate pair found with similarity {dup[2]:.2f}:\n"
+                      f"1. {dup[0][key]}\n2. {dup[1][key]}\n")
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--json_file",
+        type=str,
+        help=("Path to the dataset JSON file")
+        )
+    args = parser.parse_args()
+    if not args.json_file:
+        json_data = example_data
+
+    else:
+        with open(args.json_file, "r") as file:
+            json_data = json.load(file)
+
+    find_and_print_new_duplicates(json_data)