Skip to content

Commit

Permalink
style and requirements
Browse files Browse the repository at this point in the history
  • Loading branch information
rasbt committed May 25, 2024
1 parent abdb2fc commit b5dd225
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 8 deletions.
8 changes: 8 additions & 0 deletions ch07/02_dataset-utilities/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

This folder contains utility code that can be used for preparing an instruction dataset.

Install the additional package requirements via:

```bash
pip install -r requirements-extra.txt
```





### Finding near duplicates
Expand Down
31 changes: 23 additions & 8 deletions ch07/02_dataset-utilities/find-near-duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,19 @@

# Sample JSON dataset: two pairs of near-duplicate instruction records
# (capital-of-Italy questions and verb-identification prompts).
# Each record is listed exactly once so that only genuine near-duplicates,
# not verbatim copies, are present in the sample.
example_data = [
    {
        "instruction": "What is the capital of Italy?",
        "input": "",
        "output": "The capital of Italy is Rome.",
    },
    {
        "instruction": "What's the capital city of Italy?",
        "input": "",
        "output": "The capital city is Rome.",
    },
    {
        "instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'",
        "input": "",
        "output": "The verb is 'sleeps'.",
    },
    {
        "instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.",
        "input": "",
        "output": "The verb in the sentence is \"sleeps.\"",
    },
    # ...
]


Expand Down Expand Up @@ -48,15 +56,22 @@ def find_near_duplicates(json_data, threshold=0.8, key="instruction"):


def find_and_print_new_duplicates(json_data):
    """
    Searches each key in the first JSON object for duplicates across a list of JSON objects.
    Prints the duplicates if found.

    Args:
        json_data: A list of dict-like records. The keys of the first record
            determine which fields are searched.

    Returns:
        None. Results are written to stdout. An empty `json_data` produces
        no output because there is no first record to take keys from.
    """
    # Guard: indexing json_data[0] below would raise IndexError on empty input.
    if not json_data:
        return

    separator = 50 * '='
    for key in json_data[0]:
        # Run the search before printing the banner so that the banner
        # directly precedes this key's results.
        near_duplicates = find_near_duplicates(json_data, key=key)
        print(f"\n\n{separator}\nSearching '{key}' for duplicates ...\n{separator}")
        if not near_duplicates:
            print("No duplicates found")
            continue
        for dup in near_duplicates:
            # Each entry is indexed as (first record, second record, similarity).
            first, second, similarity = dup[0], dup[1], dup[2]
            print(
                f"Duplicate pair found with similarity {similarity:.2f}:\n"
                f"1. {first[key]}\n2. {second[key]}\n"
            )


if __name__ == "__main__":
Expand Down
2 changes: 2 additions & 0 deletions ch07/02_dataset-utilities/requirements-extra.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
openai
scikit-learn

0 comments on commit b5dd225

Please sign in to comment.