From b5dd2259b80ee0e52c678ed0363825b46e75c289 Mon Sep 17 00:00:00 2001 From: rasbt Date: Sat, 25 May 2024 11:38:55 -0500 Subject: [PATCH] style and requirements --- ch07/02_dataset-utilities/README.md | 8 +++++ .../find-near-duplicates.py | 31 ++++++++++++++----- .../requirements-extra.txt | 2 ++ 3 files changed, 33 insertions(+), 8 deletions(-) create mode 100644 ch07/02_dataset-utilities/requirements-extra.txt diff --git a/ch07/02_dataset-utilities/README.md b/ch07/02_dataset-utilities/README.md index ad132c6f..a9828ea2 100644 --- a/ch07/02_dataset-utilities/README.md +++ b/ch07/02_dataset-utilities/README.md @@ -2,6 +2,14 @@ This folder contains utility code that can be used for preparing an instruction dataset. +Install the additional package requirements via: + +```bash +pip install -r requirements-extra.txt +``` + + + ### Finding near duplicates diff --git a/ch07/02_dataset-utilities/find-near-duplicates.py b/ch07/02_dataset-utilities/find-near-duplicates.py index d74b39d3..f35f5de4 100644 --- a/ch07/02_dataset-utilities/find-near-duplicates.py +++ b/ch07/02_dataset-utilities/find-near-duplicates.py @@ -12,11 +12,19 @@ # Sample JSON dataset example_data = [ - {"instruction": "What is the capital of Italy?", "input": "", "output": "The capital of Italy is Rome."}, - {"instruction": "What's the capital city of Italy?", "input": "", "output": "The capital city is Rome."}, - {"instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'", "input": "", "output": "The verb is 'sleeps'."}, - {"instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.", "input": "", "output": "The verb in the sentence is \"sleeps.\""}, - # Add other entries... + {"instruction": "What is the capital of Italy?", + "input": "", "output": "The capital of Italy is Rome." + }, + {"instruction": "What's the capital city of Italy?", + "input": "", "output": "The capital city is Rome." + }, + {"instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'", + "input": "", "output": "The verb is 'sleeps'." + }, + {"instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.", + "input": "", "output": "The verb in the sentence is \"sleeps.\"" + }, + # ... ] @@ -48,15 +56,22 @@ def find_near_duplicates(json_data, threshold=0.8, key="instruction"): def find_and_print_new_duplicates(json_data): + """ + Searches each key in the first JSON object for duplicates across a list of JSON objects. + Prints the duplicates if found. + """ for key in json_data[0].keys(): near_duplicates = find_near_duplicates(json_data, key=key) - print(f"\n\n{50*'='}\n Searching '{key}' for duplicates ...\n{50*'='}") + separator = 50 * '=' + print(f"\n\n{separator}\nSearching '{key}' for duplicates ...\n{separator}") if not near_duplicates: print("No duplicates found") else: for dup in near_duplicates: - print(f"Duplicate pair found with similarity {dup[2]:.2f}:\n" - f"1. {dup[0][key]}\n2. {dup[1][key]}\n") + print( + f"Duplicate pair found with similarity {dup[2]:.2f}:\n" + f"1. {dup[0][key]}\n2. {dup[1][key]}\n" + ) if __name__ == "__main__": diff --git a/ch07/02_dataset-utilities/requirements-extra.txt b/ch07/02_dataset-utilities/requirements-extra.txt new file mode 100644 index 00000000..db9b9f74 --- /dev/null +++ b/ch07/02_dataset-utilities/requirements-extra.txt @@ -0,0 +1,2 @@ +openai +scikit-learn \ No newline at end of file