Add prepare command #38

Merged
merged 31 commits from tscholak/prepare-dataset into main on Nov 13, 2024

Commits (31)
7304119  fix GPTMemmapDataset (tscholak, Nov 9, 2024)
47d453b  fix GPTMemmapDataset (tscholak, Nov 9, 2024)
bef3a72  add prepare-dataset command (tscholak, Nov 10, 2024)
0ffc75c  add prepare-dataset command (tscholak, Nov 10, 2024)
fda6386  add prepare-dataset command (tscholak, Nov 10, 2024)
acae7d9  add prepare-dataset command (tscholak, Nov 10, 2024)
eb7da59  add prepare-dataset command (tscholak, Nov 10, 2024)
b5ed2f0  add prepare-dataset command (tscholak, Nov 10, 2024)
c8f746a  only push latest tag for commits to main (tscholak, Nov 10, 2024)
e0f813c  use older generics syntax (tscholak, Nov 10, 2024)
b88c9d3  remove user and install Fast-LLM globally (tscholak, Nov 10, 2024)
4df12d9  simplify Dockerfile (tscholak, Nov 11, 2024)
3737bc0  improvements (tscholak, Nov 11, 2024)
4b6b195  add docstring (tscholak, Nov 11, 2024)
52a6f0b  use full imports (tscholak, Nov 11, 2024)
55b0b88  use full imports (tscholak, Nov 11, 2024)
1f975d2  use full imports (tscholak, Nov 11, 2024)
b665e91  don't load tokenizer during validation (tscholak, Nov 11, 2024)
af1439e  Merge remote-tracking branch 'origin/main' into tscholak/prepare-dataset (tscholak, Nov 11, 2024)
e51677f  simplify (tscholak, Nov 12, 2024)
1f447bb  simplify (tscholak, Nov 12, 2024)
fb50c13  address comments (tscholak, Nov 12, 2024)
33067c8  address comments (tscholak, Nov 12, 2024)
dbc221c  address comments (tscholak, Nov 12, 2024)
a2ae051  address comments (tscholak, Nov 12, 2024)
81162b3  fixes (jlamypoirier, Nov 12, 2024)
a134a52  fix (jlamypoirier, Nov 12, 2024)
fbb011a  No venv (jlamypoirier, Nov 12, 2024)
4827f49  Faster tests (jlamypoirier, Nov 12, 2024)
f8c328f  use dtype (tscholak, Nov 13, 2024)
ded3027  remove unused venv package (tscholak, Nov 13, 2024)

Changes from all commits
7 changes: 7 additions & 0 deletions .dockerignore
@@ -1,4 +1,7 @@
+# Ignore everything by default
 *
+
+# Allow specific files and directories
 !setup.py
 !setup.cfg
 !Megatron-LM
@@ -7,3 +10,7 @@
 !tools
 !tests
 !pyproject.toml
+
+# Exclude Python cache directories and shared object files within included directories
+**/__pycache__/
+**/*.so
8 changes: 2 additions & 6 deletions .github/workflows/ci.yaml
@@ -57,12 +57,9 @@ jobs:
             ghcr.io/servicenow/fast-llm
           tags: |
             type=schedule
-            type=ref,event=branch
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=semver,pattern={{major}}
+            type=pep440,pattern={{version}}
             type=sha
-            type=raw,value=latest,enabled={{github.ref == 'refs/heads/main'}}
+            type=raw,value=latest,enable={{is_default_branch}}

       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
@@ -78,7 +75,6 @@
         uses: docker/build-push-action@v6
         with:
           context: .
-          # push: ${{ github.event_name != 'pull_request' }}
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
51 changes: 28 additions & 23 deletions Dockerfile
@@ -1,34 +1,39 @@
 # syntax=docker/dockerfile:1.7-labs
 FROM nvcr.io/nvidia/pytorch:24.07-py3

-# Install git-lfs for Huggingface hub interaction and sudo for system adjustments
+# Install dependencies.
 RUN apt-get update \
-    && apt-get install --no-install-recommends -y git-lfs sudo util-linux \
+    && apt-get install --no-install-recommends -y acl git-lfs \
     && rm -rf /var/lib/apt/lists/* \
     && git lfs install

-# Add a user for Fast-LLM with sudo privileges for runtime adjustments
-ARG FAST_LLM_USER_ID=1000
-RUN useradd -m -u $FAST_LLM_USER_ID -s /bin/bash fast_llm \
-    && echo 'fast_llm ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers
-
-USER fast_llm
+# Set the working directory.
 WORKDIR /app
+# Set the permission to 777 for all files and directories in `/app`, `/home` and python install directories:
+# 1. Create directories explicitly because docker use the wrong permission for explicit creation.
+# 2. For the rest, set the default ACL to 777 for all users.
+RUN mkdir -m 777 /app/Megatron-LM /app/examples /app/fast_llm /app/tests /app/tools \
+    && setfacl -m d:u::rwx,d:g::rwx,d:o::rwx,u::rwx,g::rwx,o::rwx \
+    /app \
+    /home \
+    /usr \
+    /usr/local \
+    /usr/local/bin \
+    /usr/local/lib \
+    /usr/local/lib/python3.10 \
+    /usr/local/lib/python3.10/dist-packages \
+    /usr/local/lib/python3.10/dist-packages/__pycache__

 # Environment settings for Python and PATH
 ENV PYTHONPATH=/app:/app/Megatron-LM \
     PATH=$PATH:/home/fast_llm/.local/bin/

-# Copy the dependency files and install dependencies
-COPY --chown=fast_llm setup.py setup.cfg pyproject.toml ./
-COPY --chown=fast_llm ./fast_llm/csrc/ fast_llm/csrc/
-RUN PIP_NO_INPUT=1 pip3 install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,DEV]"
+# Copy dependency files with universal write permissions for all users.
+COPY --chmod=777 setup.py setup.cfg pyproject.toml ./
+COPY --chmod=777 ./fast_llm/csrc/ fast_llm/csrc/

-# Copy the rest of the code
-COPY --chown=fast_llm ./Megatron-LM Megatron-LM
-COPY --chown=fast_llm ./examples examples
-COPY --chown=fast_llm ./tests tests
-COPY --chown=fast_llm ./tools tools
+# Install dependencies within the virtual environment.
+RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,DEV]"

-# Copy the main source code for Fast-LLM
-COPY --exclude=./fast_llm/csrc/ --chown=fast_llm ./fast_llm/ fast_llm/
+# Copy the remaining source code with universal write permissions.
+COPY --chmod=777 ./Megatron-LM Megatron-LM
+COPY --chmod=777 ./examples examples
+COPY --chmod=777 ./tests tests
+COPY --chmod=777 ./tools tools
+COPY --chmod=777 --exclude=./fast_llm/csrc/ ./fast_llm/ fast_llm/
10 changes: 8 additions & 2 deletions fast_llm/config.py
@@ -301,15 +301,21 @@ def __setattr__(self, key, value):
                 # Allow setting the exact same object to facilitate setup of cross-dependencies.
                 # Ex. allow re-setting cross-dependencies of already validated sub-configs.
                 return
-            raise RuntimeError()
+            raise RuntimeError(
+                f"Cannot set attribute `{key}`"
+                f" in configuration class `{get_type_name(type(self))}` after validation."
+            )
         super().__setattr__(key, value)

     def __delattr__(self, key):
         """
         Make the class read-only after validation.
         """
         if getattr(self, "_validated", False):
-            raise RuntimeError()
+            raise RuntimeError(
+                f"Cannot delete attribute `{key}`"
+                f" in configuration class `{get_type_name(type(self))}` after validation."
+            )
         super().__delattr__(key)

     def validate(self, *, _is_validating=False):
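These hooks implement a freeze-after-validation pattern: a config stays mutable while it is being built, then becomes read-only once `validate()` has run. A minimal self-contained sketch of the idea (illustrative only, not Fast-LLM's actual `Config` class):

    import typing

    class FrozenAfterValidation:
        def __init__(self) -> None:
            self._validated = False

        def validate(self) -> None:
            # ... field checks would run here ...
            self._validated = True

        def __setattr__(self, key: str, value: typing.Any) -> None:
            # Reject mutation once the instance has been validated.
            if getattr(self, "_validated", False):
                raise RuntimeError(f"Cannot set `{key}` after validation.")
            super().__setattr__(key, value)

        def __delattr__(self, key: str) -> None:
            if getattr(self, "_validated", False):
                raise RuntimeError(f"Cannot delete `{key}` after validation.")
            super().__delattr__(key)

    config = FrozenAfterValidation()
    config.x = 1       # fine: not yet validated
    config.validate()
    config.x = 2       # raises RuntimeError

Note that `getattr(self, "_validated", False)` keeps the guard safe during `__init__`, before the flag exists, and lets `validate()` itself flip the flag.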
12 changes: 12 additions & 0 deletions fast_llm/data/auto.py
@@ -0,0 +1,12 @@
from fast_llm.data.preparator.gpt_memmap.config import GPTMemmapDatasetPreparatorConfig
from fast_llm.utils import Registry

dataset_preparator_registry = Registry(
    "DatasetPreparator",
    {
        dataset_preparator.preparator_name: dataset_preparator
        for dataset_preparator in [
            GPTMemmapDatasetPreparatorConfig,
        ]
    },
)
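This registry is what lets the new prepare command dispatch on a preparator name. The exact interface of Fast-LLM's `Registry` helper is not shown in this diff, so here is a rough sketch of the same pattern with a plain dict standing in for it (the lookup helper below is hypothetical):

    from fast_llm.data.preparator.gpt_memmap.config import GPTMemmapDatasetPreparatorConfig

    # A plain dict standing in for fast_llm.utils.Registry.
    _registry: dict[str, type] = {
        config_cls.preparator_name: config_cls
        for config_cls in [GPTMemmapDatasetPreparatorConfig]
    }

    def get_preparator_config(name: str) -> type:
        # Report the known names instead of raising a bare KeyError.
        if name not in _registry:
            raise ValueError(f"Unknown dataset preparator `{name}`; known: {sorted(_registry)}")
        return _registry[name]

Adding a new preparator then only requires appending its config class to the list; the registry key comes from the class's own `preparator_name`.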
2 changes: 1 addition & 1 deletion fast_llm/data/config.py
@@ -107,7 +107,7 @@ def _validate(self):
 class TokenizerConfig(Config):
     """
     Configuration for the tokenizer.
-    Currently, the tokenizer is only needed for FIM.
+    The tokenizer is needed for FIM and dataset preparation.
     """

     format: str = Field(
24 changes: 7 additions & 17 deletions fast_llm/data/gpt/memmap.py
@@ -4,6 +4,8 @@
 import numpy as np

 from fast_llm.data.gpt.dataset import GPTIndexedDataset
+from fast_llm.data.preparator.gpt_memmap.config import MEMMAP_DTYPES, MEMMAP_DTYPES_INV, MEMMAP_INDEX_HEADER
+from fast_llm.engine.config_utils.data_type import DataType
 from fast_llm.utils import Assert, div, padded_cumsum


@@ -16,18 +18,6 @@ class GPTMemmapDataset(GPTIndexedDataset):
     See https://github.com/NVIDIA/Megatron-LM?tab=readme-ov-file#data-preprocessing for more details.
     """

-    _DTYPES = {
-        1: np.uint8,
-        2: np.int8,
-        3: np.int16,
-        4: np.int32,
-        5: np.int64,
-        6: np.float32,
-        7: np.float64,
-        8: np.uint16,
-    }
-    _INDEX_HEADER = b"MMIDIDX\x00\x00"
-
     def __init__(self, name: str, prefix: pathlib.Path | str):
         self._init(name, prefix)

@@ -37,10 +27,10 @@ def _init(self, name: str, prefix: pathlib.Path | str):
         self._prefix = pathlib.Path(prefix)

         with self._prefix.with_suffix(".idx").open("rb") as stream:
-            Assert.eq(stream.read(9), self._INDEX_HEADER)
+            Assert.eq(stream.read(9), MEMMAP_INDEX_HEADER)
             Assert.eq(struct.unpack("<Q", stream.read(8))[0], 1)

-            self._dtype = self._DTYPES[struct.unpack("<B", stream.read(1))[0]]
+            self._dtype = MEMMAP_DTYPES[struct.unpack("<B", stream.read(1))[0]].numpy
             self._num_documents = struct.unpack("<Q", stream.read(8))[0]
             _ = struct.unpack("<Q", stream.read(8))[0]
             offset = stream.tell()
@@ -106,13 +96,13 @@ def write_dataset(cls, prefix: pathlib.Path | str, documents: list[np.ndarray]):
         dtype = documents[0].dtype
         num_documents = len(documents)
         lengths = np.array([len(document) for document in documents], dtype=np.int32)
-        pointers = padded_cumsum(lengths[:-1].astype(np.int64) * 2)
+        pointers = padded_cumsum(lengths[:-1].astype(np.int64)) * np.dtype(dtype).itemsize
         prefix.parent.mkdir(parents=True, exist_ok=True)
         with prefix.with_suffix(".idx").open("wb") as stream:
-            stream.write(cls._INDEX_HEADER)
+            stream.write(MEMMAP_INDEX_HEADER)
             stream.write(struct.pack("<Q", 1))
             # Data type
-            stream.write(struct.pack("<B", {y: x for x, y in cls._DTYPES.items()}[dtype.type]))
+            stream.write(struct.pack("<B", MEMMAP_DTYPES_INV[DataType.from_numpy(dtype.type)]))
             # "Number of sequences", same as documents in our case.
             stream.write(struct.pack("<Q", num_documents))
             # "Number of documents", needs a +1 for some reason.
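Two details of this change are worth spelling out. First, the `pointers` fix: the old code multiplied document lengths by a hard-coded 2 (the byte size of a uint16 token) before the cumulative sum, so byte offsets were wrong for every other dtype; the new code scales the cumulative sum by the actual element size. A worked example, assuming `padded_cumsum` prepends a zero to the running total:

    import numpy as np

    lengths = np.array([3, 5, 2], dtype=np.int32)    # tokens per document
    # padded cumsum of lengths[:-1] -> [0, 3, 8]
    token_offsets = np.array([0, 3, 8], dtype=np.int64)
    pointers = token_offsets * np.dtype(np.int64).itemsize  # [0, 24, 64]: byte offsets at 8 bytes/token
    # The old hard-coded `* 2` would give [0, 6, 16], correct only for 2-byte dtypes.

Second, the `.idx` header layout that `_init` parses and `write_dataset` emits, shown here as an illustrative standalone reader (the file name is hypothetical):

    import struct

    with open("dataset.idx", "rb") as stream:
        assert stream.read(9) == b"MMIDIDX\x00\x00"            # MEMMAP_INDEX_HEADER magic
        (version,) = struct.unpack("<Q", stream.read(8))        # format version, always 1
        (dtype_code,) = struct.unpack("<B", stream.read(1))     # key into MEMMAP_DTYPES
        (num_sequences,) = struct.unpack("<Q", stream.read(8))  # same as documents here
        (num_documents,) = struct.unpack("<Q", stream.read(8))  # num_sequences + 1, unused on read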
Empty file.
34 changes: 34 additions & 0 deletions fast_llm/data/preparator/config.py
@@ -0,0 +1,34 @@
import abc
import argparse
import typing

from fast_llm.config import config_class
from fast_llm.engine.config_utils.runnable import RunnableConfig
from fast_llm.utils import Assert


@config_class()
class DatasetPreparatorConfig(RunnableConfig):
    preparator_name: typing.ClassVar[str]

    @classmethod
    def get_dataset_preparator_class(cls) -> type["DatasetPreparator"]:
        raise NotImplementedError

    def _get_runnable(self, parsed: argparse.Namespace) -> typing.Callable[[], None]:
        dataset_preparator = self.get_dataset_preparator_class()(config=self)
        return dataset_preparator.run


class DatasetPreparator(abc.ABC):
    _config: DatasetPreparatorConfig
    config_class: typing.ClassVar[type[DatasetPreparatorConfig]] = DatasetPreparatorConfig

    def __init__(self, config: DatasetPreparatorConfig) -> None:
        Assert.custom(isinstance, config, self.config_class)
        config.validate()
        self._config = config

    @abc.abstractmethod
    def run(self) -> None:
        raise NotImplementedError
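The config/preparator split means the CLI only ever touches the config class: `_get_runnable` instantiates the matching preparator and hands back its `run` method. A concrete preparator would plug in roughly like this (the names below are invented for illustration; the real implementation in this PR is `GPTMemmapDatasetPreparatorConfig`):

    import typing

    @config_class()
    class ToyPreparatorConfig(DatasetPreparatorConfig):
        preparator_name: typing.ClassVar[str] = "toy"  # hypothetical registry/CLI name

        @classmethod
        def get_dataset_preparator_class(cls) -> type["ToyPreparator"]:
            return ToyPreparator

    class ToyPreparator(DatasetPreparator):
        config_class: typing.ClassVar[type[ToyPreparatorConfig]] = ToyPreparatorConfig

        def run(self) -> None:
            # A real preparator would download, tokenize, and write memmap shards here.
            print(f"Preparing with validated config: {self._config}")

Because `DatasetPreparator.__init__` asserts the config type and validates it up front, `run` can assume a frozen, checked configuration.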
Empty file.