diff --git a/ch07/02_dataset-utilities/README.md b/ch07/02_dataset-utilities/README.md index 4c49a2fc..4b4066a8 100644 --- a/ch07/02_dataset-utilities/README.md +++ b/ch07/02_dataset-utilities/README.md @@ -12,7 +12,7 @@ pip install -r requirements-extra.txt -### Finding near duplicates +### Finding Near Duplicates The `find-near-duplicates.py` function can be used to identify duplicates and near-duplicates in an instruction dataset. For example, @@ -23,6 +23,9 @@ python find-near-duplicates.py --json_file instruction-examples.json ``` ``` +scikit-learn version: 1.3.1 + + ================================================== Searching 'instruction' for duplicates ... ================================================== @@ -56,6 +59,22 @@ Duplicate pair found with similarity 1.00: ``` -  +  You can use the `--threshold` setting with a value between 0 and 1 to decrease or increase the sensitivity. The default threshold is 0.9. + + + +  + ## Creating Passive Voice Entries + + - The [create-passive-voice-entries.ipynb](create-passive-voice-entries.ipynb) notebook uses OpenAI's GPT-4 to create "passive voice" entries for an instruction dataset, as shown in the example below + + ```python + { + 'instruction': 'Identify the verb in the following sentence', + 'input': 'The cat sleeps on the couch.', + 'output': 'The verb in the sentence is "sleeps."', + 'output_2': 'The sentence is "sleeps."' # <---- Newly created entry + } + ``` diff --git a/ch07/02_dataset-utilities/find-near-duplicates.py b/ch07/02_dataset-utilities/find-near-duplicates.py index b49b26d5..45b2fceb 100644 --- a/ch07/02_dataset-utilities/find-near-duplicates.py +++ b/ch07/02_dataset-utilities/find-near-duplicates.py @@ -61,7 +61,7 @@ def find_near_duplicates(json_data, threshold=0.75, key="instruction"): for i in range(len(cos_sim_matrix)): for j in range(i+1, len(cos_sim_matrix)): if cos_sim_matrix[i, j] > threshold: - if len(json_data[i][key]) <= 1 or len(json_data[j][key]) <=1: + if len(json_data[i][key]) <= 1 or len(json_data[j][key]) <= 1: continue near_duplicates.append((json_data[i], json_data[j], cos_sim_matrix[i, j])) if key in ("input", "output"): # Don't remove duplicates based on the instruction