Commit 4ce9591

Author: chenfeiyu (committed)
initial commit: add piecewise attention
0 parents  commit 4ce9591

11 files changed, +1171 -0 lines changed

.gitignore (+132)

@@ -0,0 +1,132 @@
# results of benchmark
benchmark/results/

# version, since setuptools-scm is used, this file is automatically generated when building the package
src/flag_attn/_version.py

# Editors
.vscode/
.idea/

# Vagrant
.vagrant/

# Mac/OSX
.DS_Store

# Windows
Thumbs.db

# Source for the following rules: https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

LICENSE (+13)

@@ -0,0 +1,13 @@
Copyright 2023 BAAI

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

README.md (+102)

@@ -0,0 +1,102 @@
# FlagAttention

FlagAttention is a project for memory-efficient attention operators implemented in the Triton language. It is inspired by [FlashAttention](https://arxiv.org/abs/2205.14135) and [FlashAttention v2](https://tridao.me/publications/flash2/flash2.pdf) and extends them to satisfy the needs of research on large language modeling. FlashAttention and FlashAttention-2 save memory footprint and traffic to improve memory efficiency, but modifying them to add more options and functionalities requires proficiency in CUDA programming. FlagAttention is therefore implemented in the Triton language, which is easier to use for writing custom GPU kernels.

## Installation

### Requirements

FlagAttention requires torch and triton. To use new features of triton, triton nightly is recommended.

Instructions for installing torch nightly can be found at https://pytorch.org/get-started/locally/ . Triton is now a dependency of torch nightly, so it is installed automatically.

FlagAttention requires Ampere Nvidia GPUs (e.g. A100, RTX-3090, ...) and CUDA Toolkit 11.6 or above. Other GPUs may work but have not been tested yet.

FlagAttention can be installed in either of the ways below.

1. Editable installation. This includes tests and benchmarks; changes to the code in the local source tree take effect without re-installation.
2. Build a distribution and then install. Only the package is installed.

### Editable Installation

Editable installation with `pip`:

```sh
git clone https://github.com/FlagOpen/FlagAttention && cd FlagAttention
pip install -e .
```

### Build a Distribution & Install

Following modern Python packaging conventions, `FlagAttention` is configured by [`pyproject.toml`](https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/), and no `setup.py` is provided. To build a distribution, either a source distribution or a binary distribution, the Python module `build` is recommended.

First, install the `build` package via pip.

```sh
pip install build
```

Then build the package.

```sh
git clone https://github.com/FlagOpen/FlagAttention && cd FlagAttention
python -m build --no-isolation
```

The built package is placed in `dist/`, ready for installation.

```sh
pip install dist/flag_attn-xxx.whl
```
## Usage

FlagAttention provides customized attention operators. When an operator is equivalent to a torch function, it can be used as a drop-in replacement.
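
For example, the piecewise attention operator described under Operators below can be called directly on CUDA tensors. The following is a minimal sketch that mirrors the call in `benchmark/piecewise_benchmark.py`; the positional argument order follows that script and the shapes are illustrative only (see `src/flag_attn/piecewise.py` for the authoritative signature).

```python
import math
import torch
from flag_attn import piecewise_attn

B, H, T, D = 2, 16, 2048, 64  # batch, heads, sequence length, head dim (illustrative values)
q1, k1, q2, k2, v = (
    torch.randn(B, H, T, D, dtype=torch.float16, device="cuda") for _ in range(5)
)
w = T // 2  # distance threshold
o = piecewise_attn(q1, k1, q2, k2, v, w, True, 1.0 / math.sqrt(D))  # causal=True, sm_scale
print(o.shape)  # expected: (B, H, T, D)
```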

## Run the Tests

A recent version of `pytest` (>=7.1.0) is required to run the tests in `tests`. Operators in `FlagAttention` are tested against a reference implementation in PyTorch, both forward and backward. For `float16` and `bfloat16`, we set the absolute and relative tolerance to `1e-2` and `1e-3`, respectively.

```sh
pytest .
```

## Run the Benchmark

Benchmarks are provided to measure the achieved TFLOPs/s. Since the operators in `FlagAttention` deviate from flash attention, the total amount of computation differs even when the batch size, sequence length, number of heads, head dimension, and other configurations are the same. To calculate the FLOPs of an operator, only matmuls are counted. The FLOP count is divided by the median runtime to get the achieved FLOPs/s.
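
As a rough illustration of this counting (mirroring `benchmark/piecewise_benchmark.py`; the helper name below is ours, not part of the package), the forward FLOPs of piecewise attention can be estimated as:

```python
def piecewise_attention_fwd_flops(batch, heads, seqlen, head_dim, causal=False):
    # only matmuls are counted: Q1@K1, Q2@K2 and P@V, each batch*heads*seqlen*seqlen*head_dim MACs
    macs = 3 * batch * heads * seqlen * seqlen * head_dim
    flops = 2 * macs  # 2 FLOPs per multiply-accumulate
    return flops * 0.5 if causal else flops  # causal masking roughly halves the work
```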

## Operators

### Piecewise Attention

The first extension of flash attention is [piecewise attention](src/flag_attn/piecewise.py).

```
piecewise_attention(q1, k1, q2, k2, v, dist_threshold, softmax_scale=None, causal=False)
```

It is named `piecewise_attention` in that it takes two `q`'s and two `k`'s to compute attention scores (S) before applying softmax to get the attention weights (P). The design originates from the fact that a transformer with rotary position embedding is not good at predicting sequences longer than the longest sequence it is trained on. Pairs of (q, k) get unexpectedly high attention scores when the distance between them is greater than the maximum sequence length in the training set. A proposal to solve this problem is to compute the attention score in different ways, depending on whether the distance between `q` and `k` is greater than a threshold.

In practice, `q` and `k` can be preprocessed in two different ways to get `q1, q2` and `k1, k2`. Then the attention score is computed as the dot product of `q1, k1` or `q2, k2`, depending on the distance between `q` and `k`.

![piecewise attention](assets/piecewise_attention.png)
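
For illustration only, here is a PyTorch sketch of the intended semantics (not the Triton kernel); which `(q, k)` pair serves the within-threshold branch, and aligning queries to the end of the key sequence when the lengths differ, are assumptions of this sketch:

```python
import math
import torch

def piecewise_attn_ref(q1, k1, q2, k2, v, dist_threshold, causal=False, sm_scale=None):
    # q1, q2: (B, H, M, D); k1, k2, v: (B, H, N, D), N >= M allowed
    M, D = q1.shape[-2], q1.shape[-1]
    N = k1.shape[-2]
    if sm_scale is None:
        sm_scale = 1.0 / math.sqrt(D)
    s1 = torch.einsum("bhmd,bhnd->bhmn", q1, k1) * sm_scale
    s2 = torch.einsum("bhmd,bhnd->bhmn", q2, k2) * sm_scale
    # distance between query position i and key position j; queries are aligned to
    # the end of the key sequence when M != N (an assumption of this sketch)
    q_pos = torch.arange(M, device=q1.device)[:, None] + (N - M)
    k_pos = torch.arange(N, device=q1.device)[None, :]
    dist = q_pos - k_pos
    # assumption: (q1, k1) scores are used within the threshold, (q2, k2) beyond it
    s = torch.where(dist <= dist_threshold, s1, s2)
    if causal:
        s = s.masked_fill(dist < 0, float("-inf"))
    p = torch.softmax(s.float(), dim=-1).to(v.dtype)
    return torch.einsum("bhmn,bhnd->bhmd", p, v)
```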

Features:

- the sequence length of k/v can be larger than that of q;
- float16 and bfloat16 data types are supported on Ampere Nvidia GPUs;
- causal and non-causal modes are supported;
- both forward and backward are supported.

Limitations:

- headdim should be in `[16, 32, 64, 128]`;
- dropout is not supported yet.

## TODOs

1. Test on other GPUs;
2. Test on more triton versions;
3. Improve performance of the attention operators;
4. Support other extensions of flash attention.

assets/piecewise_attention.png (1.44 MB)

benchmark/piecewise_benchmark.py (+91)

@@ -0,0 +1,91 @@
import math
import pathlib
import torch
import triton

from flag_attn import piecewise_attn

try:
    from flash_attn.flash_attn_interface import \
        flash_attn_qkvpacked_func as flash_attn_func
    FLASH_VER = 2
except BaseException:
    try:
        from flash_attn.flash_attn_interface import flash_attn_func
        FLASH_VER = 1
    except BaseException:
        FLASH_VER = None
HAS_FLASH = FLASH_VER is not None

BATCH, N_HEADS, N_CTX, D_HEAD = 4, 48, 4096, 64

configs = [triton.testing.Benchmark(
    x_names=['N_CTX'],
    x_vals=[2**i for i in range(10, 16)],
    line_arg='provider',
    line_vals=['triton', ] + (['flash'] if HAS_FLASH else []),
    line_names=['triton', ] + ([f'flash-{FLASH_VER}'] if HAS_FLASH else []),
    styles=[('red', '-'), ('green', '-')],
    ylabel='tflop/s',
    plot_name=f'piecewise_attention_batch-{BATCH}_head-{N_HEADS}_d-{D_HEAD}_mode-{mode}_causal-{causal}_dtype-{dtype}',
    args={'H': N_HEADS, 'BATCH': BATCH, 'D_HEAD': D_HEAD, 'dtype': dtype, 'mode': mode, 'causal': causal}
) for mode in ['fwd', 'bwd'] for causal in [False, True] for dtype in [torch.float16, torch.bfloat16]]

@triton.testing.perf_report(configs)
def bench_flash_attention(BATCH, H, N_CTX, D_HEAD, causal, mode, provider, dtype=torch.float16, device="cuda"):
    assert mode in ['fwd', 'bwd']
    w = N_CTX // 2  # dist threshold
    warmup = 25
    rep = 100
    if provider == "triton":
        q1 = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)
        k1 = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)
        q2 = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)
        k2 = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)
        v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda", requires_grad=True)
        sm_scale = 1. / math.sqrt(D_HEAD)
        fn = lambda: piecewise_attn(q1, k1, q2, k2, v, w, causal, sm_scale)
        if mode == 'bwd':
            o = fn()
            do = torch.randn_like(o)
            fn = lambda: o.backward(do, retain_graph=True)
        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
    if provider == "flash":
        qkv = torch.randn((BATCH, N_CTX, 3, H, D_HEAD), dtype=dtype, device=device, requires_grad=True)
        if FLASH_VER == 1:
            lengths = torch.full((BATCH,), fill_value=N_CTX, device=device)
            cu_seqlens = torch.zeros((BATCH + 1,), device=device, dtype=torch.int32)
            cu_seqlens[1:] = lengths.cumsum(0)
            qkv = qkv.reshape(BATCH * N_CTX, 3, H, D_HEAD)
            fn = lambda: flash_attn_func(qkv, cu_seqlens, 0., N_CTX, causal=causal)
        elif FLASH_VER == 2:
            fn = lambda: flash_attn_func(qkv, causal=causal)
        else:
            raise ValueError(f'unknown {FLASH_VER = }')
        if mode == 'bwd':
            o = fn()
            do = torch.randn_like(o)
            fn = lambda: o.backward(do, retain_graph=True)
        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)

    # total TFLOPS: following Flash Attention v2, only gemms are counted.
    # NOTE: this is not an apples-to-apples comparison; the total amount of flops and the elapsed time differ,
    # so the tflop/s is used as a metric of the operator's performance, for reference only.
    if provider == "flash":
        macs = 2. * BATCH * H * N_CTX * N_CTX * D_HEAD  # Q@K, P@V
        if mode == 'bwd':
            macs *= 2.5  # Q@K, dO@V, dO@P, dS@Q, dS@K
    else:
        macs = 3. * BATCH * H * N_CTX * N_CTX * D_HEAD  # Q1@K1, Q2@K2, P@V
        if mode == 'bwd':
            macs *= 8/3.0  # Q1@K1, Q2@K2, dO@V, dO@P, dS1@Q1, dS1@K1, dS2@Q2, dS2@K2
    total_flops = 2 * macs

    if causal:
        total_flops *= 0.5
    return total_flops / ms * 1e-9

# only works on post-Ampere GPUs right now
output_dir = pathlib.Path("results")
output_dir.mkdir(exist_ok=True)
bench_flash_attention.run(save_path=output_dir, print_data=True)

pyproject.toml (+52)

@@ -0,0 +1,52 @@
[project]
name = "flag_attn"
dynamic = ["version"]
authors = [
    {name = "Chen Feiyu", email = "[email protected]"},
]
description = "A collection of memory efficient attention operators implemented in triton language."
readme = {file = "README.md", content-type = "text/markdown"}
requires-python = ">=3.7"
license = {text = "LICENSE.txt"}
classifiers = [
    "Development Status :: 3 - Alpha",
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
]
dependencies = [
    "torch>=2.1.0",
]

[project.optional-dependencies]
test = [
    "pytest>=7.1.0",
]

[project.urls]
homepage = "https://github.com/FlagOpen/FlagAttention"


[build-system]
requires = ["setuptools>=60", "setuptools-scm>=8.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools_scm]
version_file = "src/flag_attn/_version.py"

[tool.setuptools.packages.find]
where = ["src"]
include = ["flag_attn"]
namespaces = false

# helps with setting up pytest in pyproject.toml
# https://docs.pytest.org/en/7.3.x/reference/customize.html#rootdir
# https://docs.pytest.org/en/7.3.x/reference/reference.html#confval-pythonpath
[tool.pytest.ini_options]
testpaths = [
    "tests",
]
pythonpath = [
    "src",
    "tests/flag_attn",
]
