From b5dd2259b80ee0e52c678ed0363825b46e75c289 Mon Sep 17 00:00:00 2001
From: rasbt <mail@sebastianraschka.com>
Date: Sat, 25 May 2024 11:38:55 -0500
Subject: [PATCH] style and requirements

---
 ch07/02_dataset-utilities/README.md           |  8 +++++
 .../find-near-duplicates.py                   | 31 ++++++++++++++-----
 .../requirements-extra.txt                    |  2 ++
 3 files changed, 33 insertions(+), 8 deletions(-)
 create mode 100644 ch07/02_dataset-utilities/requirements-extra.txt

diff --git a/ch07/02_dataset-utilities/README.md b/ch07/02_dataset-utilities/README.md
index ad132c6f..a9828ea2 100644
--- a/ch07/02_dataset-utilities/README.md
+++ b/ch07/02_dataset-utilities/README.md
@@ -2,6 +2,14 @@
 
 This folder contains utility code that can be used for preparing an instruction dataset.
 
+Install the additional package requirements via:
+
+```bash
+pip install -r requirements-extra.txt
+```
+
+
+
 
 
 ### Finding near duplicates
diff --git a/ch07/02_dataset-utilities/find-near-duplicates.py b/ch07/02_dataset-utilities/find-near-duplicates.py
index d74b39d3..f35f5de4 100644
--- a/ch07/02_dataset-utilities/find-near-duplicates.py
+++ b/ch07/02_dataset-utilities/find-near-duplicates.py
@@ -12,11 +12,19 @@
 
 # Sample JSON dataset
 example_data = [
-    {"instruction": "What is the capital of Italy?", "input": "", "output": "The capital of Italy is Rome."},
-    {"instruction": "What's the capital city of Italy?", "input": "", "output": "The capital city is Rome."},
-    {"instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'", "input": "", "output": "The verb is 'sleeps'."},
-    {"instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.", "input": "", "output": "The verb in the sentence is \"sleeps.\""},
-    # Add other entries...
+    {"instruction": "What is the capital of Italy?",
+     "input": "", "output": "The capital of Italy is Rome."
+     },
+    {"instruction": "What's the capital city of Italy?",
+     "input": "", "output": "The capital city is Rome."
+     },
+    {"instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'",
+     "input": "", "output": "The verb is 'sleeps'."
+     },
+    {"instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.",
+     "input": "", "output": "The verb in the sentence is \"sleeps.\""
+     },
+    # ...
 ]
 
 
@@ -48,15 +56,22 @@ def find_near_duplicates(json_data, threshold=0.8, key="instruction"):
 
 
 def find_and_print_new_duplicates(json_data):
+    """
+    For each key in the first JSON object, searches the list of JSON objects for near duplicates.
+    Prints every near-duplicate pair with its similarity score, or a notice if none are found.
+    """
     for key in json_data[0].keys():
         near_duplicates = find_near_duplicates(json_data, key=key)
-        print(f"\n\n{50*'='}\n Searching '{key}' for duplicates ...\n{50*'='}")
+        separator = 50 * '='
+        print(f"\n\n{separator}\nSearching '{key}' for duplicates ...\n{separator}")
         if not near_duplicates:
             print("No duplicates found")
         else:
             for dup in near_duplicates:
-                print(f"Duplicate pair found with similarity {dup[2]:.2f}:\n"
-                      f"1. {dup[0][key]}\n2. {dup[1][key]}\n")
+                print(
+                    f"Duplicate pair found with similarity {dup[2]:.2f}:\n"
+                    f"1. {dup[0][key]}\n2. {dup[1][key]}\n"
+                )
 
 
 if __name__ == "__main__":
diff --git a/ch07/02_dataset-utilities/requirements-extra.txt b/ch07/02_dataset-utilities/requirements-extra.txt
new file mode 100644
index 00000000..db9b9f74
--- /dev/null
+++ b/ch07/02_dataset-utilities/requirements-extra.txt
@@ -0,0 +1,2 @@
+openai
+scikit-learn
\ No newline at end of file