-
Notifications
You must be signed in to change notification settings - Fork 21
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Auto dataset concatenation prototype #128
Conversation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
fast_llm/data/dataset/gpt/config.py
Outdated
class GPTComposedDatasetConfig(GPTIndexedDatasetConfig): | ||
_abstract: typing.ClassVar[bool] = False | ||
type_: typing.ClassVar[str | None] = "composed" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
class GPTComposedDatasetConfig(GPTIndexedDatasetConfig): | |
_abstract: typing.ClassVar[bool] = False | |
type_: typing.ClassVar[str | None] = "composed" | |
class GPTConcatenatedMemmapConfig(GPTIndexedDatasetConfig): | |
_abstract: typing.ClassVar[bool] = False | |
type_: typing.ClassVar[str | None] = "concatenated_memmap" |
for your convenience, so that we can merge this easily.
tests/test_dataset.py
Outdated
@@ -10,6 +10,7 @@ | |||
from fast_llm.data.data.gpt.data import GPTData | |||
from fast_llm.data.dataset.gpt.config import ( | |||
GPTBlendedDatasetConfig, | |||
GPTComposedDatasetConfig, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
GPTComposedDatasetConfig, | |
GPTConcatenatedMemmapConfig, |
tests/test_dataset.py
Outdated
{"type": "composed", "path": _DATASET_PREFIX_MIX_COMPOSED}, | ||
GPTComposedDatasetConfig, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
{"type": "composed", "path": _DATASET_PREFIX_MIX_COMPOSED}, | |
GPTComposedDatasetConfig, | |
{"type": "concatenated_memmap", "path": _DATASET_PREFIX_MIX_CONCATENATED_MEMMAP}, | |
GPTConcatenatedMemmapConfig, |
tests/test_dataset.py
Outdated
@@ -81,11 +88,16 @@ def get_test_data_and_samples( | |||
return data, samples | |||
|
|||
|
|||
DATASET_PREFIX_MIX_1 = DATASET_PREFIX.with_name("blended_mix_1") | |||
_DATASET_PREFIX_MIX_1 = DATASET_PREFIX.with_name("blended_mix_1") | |||
_DATASET_PREFIX_MIX_COMPOSED = DATASET_CACHE / "composed" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
_DATASET_PREFIX_MIX_COMPOSED = DATASET_CACHE / "composed" | |
_DATASET_PREFIX_MIX_CONCATENATED_MEMMAP = DATASET_CACHE / "concatenated_memmap" |
✨ Description
Fixes: #120. A basic approach, to be refined in #123.
🔍 Type of change
Select all that apply: