forked from rasbt/LLMs-from-scratch
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
1,146 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# Chapter 7: Instruction and Preference Finetuning | ||
|
||
This folder contains utility code that can be used for preparing an instruction dataset. | ||
|
||
|
||
|
||
### Finding near duplicates | ||
|
||
The `find-near-duplicates.py` script can be used to identify duplicates and near-duplicates in an instruction dataset. For example,
|
||
|
||
|
||
```bash
python find-near-duplicates.py --json_file instruction-examples.json | ||
``` | ||
|
||
``` | ||
================================================== | ||
Searching 'instruction' for duplicates ... | ||
================================================== | ||
Duplicate pair found with similarity 0.85: | ||
1. Determine the state of matter for helium at room temperature. | ||
2. Determine the state of matter for nitrogen at room temperature. | ||
Duplicate pair found with similarity 0.98: | ||
1. Edit the following sentence to make it more formal. | ||
2. Edit the sentence to make it more formal. | ||
Duplicate pair found with similarity 1.00: | ||
1. Name a dwarf planet in our solar system. | ||
2. Name a dwarf planet in our solar system. | ||
Duplicate pair found with similarity 0.88: | ||
1. Change the sentences from active voice to passive voice. | ||
2. Change the sentence from passive to active voice. | ||
================================================== | ||
Searching 'input' for duplicates ... | ||
================================================== | ||
Duplicate pair found with similarity 0.88: | ||
1. | ||
2. She said, "I am tired." | ||
================================================== | ||
Searching 'output' for duplicates ... | ||
================================================== | ||
Duplicate pair found with similarity 0.82: | ||
1. Helium is in a gaseous state at room temperature. | ||
2. Nitrogen is in a gaseous state at room temperature. | ||
Duplicate pair found with similarity 1.00: | ||
1. One dwarf planet in our solar system is Pluto. | ||
2. One dwarf planet in our solar system is Pluto. | ||
``` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
|
||
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt). | ||
# Source for "Build a Large Language Model From Scratch" | ||
# - https://www.manning.com/books/build-a-large-language-model-from-scratch | ||
# Code: https://github.com/rasbt/LLMs-from-scratch | ||
|
||
import argparse | ||
import json | ||
from sklearn.feature_extraction.text import TfidfVectorizer | ||
from sklearn.metrics.pairwise import cosine_similarity | ||
|
||
|
||
# Sample JSON dataset, used when no --json_file argument is given.
# Each entry has the "instruction", "input", and "output" fields.
example_data = [
    {
        "instruction": "What is the capital of Italy?",
        "input": "",
        "output": "The capital of Italy is Rome.",
    },
    {
        "instruction": "What's the capital city of Italy?",
        "input": "",
        "output": "The capital city is Rome.",
    },
    {
        "instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'",
        "input": "",
        "output": "The verb is 'sleeps'.",
    },
    {
        "instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.",
        "input": "",
        "output": "The verb in the sentence is \"sleeps.\"",
    },
    # Add other entries...
]
|
||
|
||
def find_near_duplicates(json_data, threshold=0.8, key="instruction"):
    """Find pairs of dataset entries whose `key` text is nearly identical.

    Texts are vectorized with TF-IDF and compared by cosine similarity.
    The higher the threshold, the more similar the texts have to be to match.

    Args:
        json_data: List of dict entries (e.g. instruction/input/output records).
        threshold: A pair is reported when its similarity is strictly greater
            than this value.
        key: Name of the field whose text is compared.

    Returns:
        A list of `(entry_a, entry_b, similarity)` tuples. Empty when no entry
        has non-empty text under `key`.
    """
    # Keep the entries that actually have text under `key` (missing or empty
    # values are skipped). The similarity matrix is built from this filtered
    # list, so matches must be looked up in it as well; indexing the original
    # json_data with matrix positions would report the wrong entries whenever
    # an entry was skipped.
    candidates = [item for item in json_data if item.get(key)]
    near_duplicates = []

    if not candidates:
        return near_duplicates

    # Vectorize the text data
    text = [item[key] for item in candidates]
    vectorizer = TfidfVectorizer(stop_words=None)
    tfidf_matrix = vectorizer.fit_transform(text)

    # Compute cosine similarity between each pair of entries
    cos_sim_matrix = cosine_similarity(tfidf_matrix)

    # Collect near-duplicate pairs; only j > i is visited so that each
    # unordered pair is checked once and no entry is compared with itself.
    num_candidates = len(candidates)
    for i in range(num_candidates):
        for j in range(i + 1, num_candidates):
            similarity = cos_sim_matrix[i, j]
            if similarity > threshold:
                near_duplicates.append((candidates[i], candidates[j], similarity))

    return near_duplicates
|
||
|
||
def find_and_print_new_duplicates(json_data):
    """Search every field of the dataset for near-duplicates and print them.

    The fields searched are the keys of the first entry. For each field a
    header is printed, followed by either the matching pairs with their
    similarity score or "No duplicates found".

    Args:
        json_data: List of dict entries to search.
    """
    # An empty dataset has no first entry to take the field names from.
    if not json_data:
        print("No entries found in the dataset")
        return

    for key in json_data[0]:
        near_duplicates = find_near_duplicates(json_data, key=key)
        print(f"\n\n{50*'='}\n Searching '{key}' for duplicates ...\n{50*'='}")
        if not near_duplicates:
            print("No duplicates found")
            continue
        for first, second, similarity in near_duplicates:
            print(
                f"Duplicate pair found with similarity {similarity:.2f}:\n"
                f"1. {first[key]}\n2. {second[key]}\n"
            )
|
||
|
||
if __name__ == "__main__":

    # Command-line entry point: optionally takes the dataset path.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--json_file",
        type=str,
        help=("Path to the dataset JSON file")
    )
    args = parser.parse_args()

    if not args.json_file:
        # No file given: fall back to the built-in sample dataset
        json_data = example_data
    else:
        # Read as UTF-8 explicitly rather than relying on the platform's
        # default encoding, which may not be UTF-8.
        with open(args.json_file, "r", encoding="utf-8") as file:
            json_data = json.load(file)

    find_and_print_new_duplicates(json_data)
Oops, something went wrong.