Skip to content

Split up generate_data and add a mix_datasets top level API #443

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,6 @@ cython_debug/

# IDEs
.vscode/

# SDG examples output
docs/examples/**/output
8 changes: 8 additions & 0 deletions docs/examples/mix_datasets/concatenate_recipe.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# An example of how to concatenate two datasets
# Each dataset has a sampling_size of 1.0 to take all samples from both
datasets:
- path: dataset_1.jsonl
sampling_size: 1.0
- path: dataset_2.jsonl
sampling_size: 1.0
sys_prompt: I am a reliable AI assistant.
5 changes: 5 additions & 0 deletions docs/examples/mix_datasets/dataset_1.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"id": "dataset_1_1", "messages": [], "metadata": {}}
{"id": "dataset_1_2", "messages": [], "metadata": {}}
{"id": "dataset_1_3", "messages": [], "metadata": {}}
{"id": "dataset_1_4", "messages": [], "metadata": {}}
{"id": "dataset_1_5", "messages": [], "metadata": {}}
5 changes: 5 additions & 0 deletions docs/examples/mix_datasets/dataset_2.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"id": "dataset_2_1", "messages": [], "metadata": {}}
{"id": "dataset_2_2", "messages": [], "metadata": {}}
{"id": "dataset_2_3", "messages": [], "metadata": {}}
{"id": "dataset_2_4", "messages": [], "metadata": {}}
{"id": "dataset_2_5", "messages": [], "metadata": {}}
18 changes: 18 additions & 0 deletions docs/examples/mix_datasets/example_mixing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
"""Example of mixing datasets with the instructlab.sdg mix_datasets API."""

# Standard
from pathlib import Path

# First Party
from instructlab.sdg import mix_datasets

# All inputs live next to this script; outputs go in an "output" subdirectory.
output_dir = Path(__file__).parent.joinpath("output")
output_dir.mkdir(exist_ok=True)

# Concatenate two datasets, taking every sample from both.
concatenate_recipe_yaml = Path(__file__).parent.joinpath("concatenate_recipe.yaml")
concatenated_output_jsonl = output_dir.joinpath("concatenated.jsonl")
mix_datasets(concatenate_recipe_yaml, concatenated_output_jsonl)

# Mix two datasets with unequal sampling weights.
weighted_recipe_yaml = Path(__file__).parent.joinpath("weighted_recipe.yaml")
weighted_output_jsonl = output_dir.joinpath("weighted.jsonl")
mix_datasets(weighted_recipe_yaml, weighted_output_jsonl)
9 changes: 9 additions & 0 deletions docs/examples/mix_datasets/weighted_recipe.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# An example of how to weight one dataset over another
# Dataset 1 has a sampling_size of 2.0 to double its samples
# Dataset 2 has a sampling_size of 0.2 to take 20% of its samples
datasets:
- path: dataset_1.jsonl
sampling_size: 2.0
- path: dataset_2.jsonl
sampling_size: 0.2
sys_prompt: I am a reliable AI assistant.
3 changes: 2 additions & 1 deletion src/instructlab/sdg/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"FULL_PIPELINES_PACKAGE",
"SIMPLE_PIPELINES_PACKAGE",
"generate_data",
"mix_datasets",
)

# Local
Expand All @@ -50,7 +51,7 @@
SelectorBlock,
SetToMajorityValueBlock,
)
from .generate_data import generate_data
from .generate_data import generate_data, mix_datasets
from .pipeline import (
FULL_PIPELINES_PACKAGE,
SIMPLE_PIPELINES_PACKAGE,
Expand Down
23 changes: 20 additions & 3 deletions src/instructlab/sdg/datamixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def _create_mixed_dataset(self, num_proc):
Create the final mixed dataset by loading, sampling, and
concatenating all datasets in this recipe
"""
if not self.dataset_added:
if not self.datasets:
logger.error("No dataset added to the recipe")

mixed_ds = self._load_and_sample_datasets(num_proc)
Expand Down Expand Up @@ -726,19 +726,36 @@ def collect(
sampling_size=self.NUM_SYNTH_SKILLS,
)

def _write_mixed_recipe(self, recipe, output_file_recipe):
"""
Write the recipes created during data mixing without writing the actual
mixed datasets to disk.
"""
full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
recipe.save_recipe(full_recipe_path)

def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
    """
    Mix the generated leaf node data into a single dataset and write it to
    disk. The heavy lifting is delegated to the Recipe class.
    """
    # The recipe file is always written, even when no datasets were added.
    self._write_mixed_recipe(recipe, output_file_recipe)
    # Only write the mixed dataset itself when the recipe has datasets.
    if recipe.dataset_added:
        # NOTE(review): these two lines duplicate the save performed by
        # _write_mixed_recipe above — in this diff view they appear to be
        # the removed (pre-change) lines; confirm against the merged file.
        full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
        recipe.save_recipe(full_recipe_path)
        recipe.save_mixed_dataset(
            os.path.join(self.output_dir, output_file_data),
            self.num_procs,
        )

def write_recipes(self):
    """Write the knowledge and skills mixing recipes to the output directory."""
    recipe_pairs = (
        (self.knowledge_recipe, self.output_file_knowledge_recipe),
        (self.skills_recipe, self.output_file_skills_recipe),
    )
    for recipe, recipe_file in recipe_pairs:
        self._write_mixed_recipe(recipe, recipe_file)

def generate(self):
self._gen_mixed_data(
self.knowledge_recipe,
Expand Down
Loading
Loading