Skip to content

Commit

Permalink
address review
Browse files Browse the repository at this point in the history
  • Loading branch information
fxmarty committed Jun 7, 2024
1 parent 4220423 commit b884d2b
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 90 deletions.
12 changes: 6 additions & 6 deletions launcher/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ fn shard_manager(
rope_factor: Option<f32>,
max_total_tokens: usize,
max_batch_size: Option<usize>,
max_batch_prefill_tokens: u32,
max_input_tokens: usize,
otlp_endpoint: Option<String>,
log_level: LevelFilter,
status_sender: mpsc::Sender<ShardStatus>,
Expand Down Expand Up @@ -550,8 +550,8 @@ fn shard_manager(
}

// In case we use a sliding window, we may ignore the sliding window in flash attention for some backends, depending on the parameter.
shard_args.push("--max-batch-prefill-tokens".to_string());
shard_args.push(max_batch_prefill_tokens.to_string());
shard_args.push("--max-input-tokens".to_string());
shard_args.push(max_input_tokens.to_string());

// Copy current process env
let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();
Expand Down Expand Up @@ -1009,7 +1009,7 @@ fn spawn_shards(
args: &Args,
cuda_graphs: Vec<usize>,
max_total_tokens: usize,
max_batch_prefill_tokens: u32,
max_input_tokens: usize,
max_log_level: LevelFilter,
shutdown: Arc<AtomicBool>,
shutdown_receiver: &mpsc::Receiver<()>,
Expand Down Expand Up @@ -1067,7 +1067,7 @@ fn spawn_shards(
rope_factor,
max_total_tokens,
max_batch_size,
max_batch_prefill_tokens,
max_input_tokens,
otlp_endpoint,
max_log_level,
status_sender,
Expand Down Expand Up @@ -1542,7 +1542,7 @@ fn main() -> Result<(), LauncherError> {
&args,
cuda_graphs,
max_total_tokens,
max_batch_prefill_tokens,
max_input_tokens,
max_log_level,
shutdown.clone(),
&shutdown_receiver,
Expand Down
4 changes: 2 additions & 2 deletions server/text_generation_server/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def serve(
logger_level: str = "INFO",
json_output: bool = False,
otlp_endpoint: Optional[str] = None,
max_batch_prefill_tokens: Optional[int] = None,
max_input_tokens: Optional[int] = None,
):
if sharded:
assert (
Expand Down Expand Up @@ -98,7 +98,7 @@ def serve(
dtype,
trust_remote_code,
uds_path,
max_batch_prefill_tokens,
max_input_tokens,
)


Expand Down
11 changes: 3 additions & 8 deletions server/text_generation_server/layers/attention/rocm.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,10 +169,8 @@ def attention(
):
if window_size_left <= 0 and window_size_left != -1:
raise ValueError("`window_size_left` must be > 0 or -1")
if window_size_left != -1 and q.shape[0] > window_size_left:
raise ValueError(
f"ROCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
)

# We do not need to check window_size_left (not supported) here, as it is already checked ahead of time at model load.
return flash_attn_2_cuda.varlen_fwd(
q,
k,
Expand Down Expand Up @@ -204,10 +202,7 @@ def attention(
window_size_left=-1,
causal=True,
):
if window_size_left != -1 and q.shape[0] > window_size_left:
raise ValueError(
f"RoCm version of Flash Attention v2 does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
)
# We do not need to check window_size_left (not supported) here, as it is already checked ahead of time at model load.
output, _ = triton_attention(
q,
k,
Expand Down
5 changes: 1 addition & 4 deletions server/text_generation_server/layers/attention/xpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@ def attention(
softmax_scale,
window_size_left=-1,
):
if window_size_left != -1 and q.shape[0] > window_size_left:
raise ValueError(
f"XPU version of Flash Attention does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
)
# We do not need to check window_size_left (not supported) here, as it is already checked ahead of time at model load.
return ipex.llm.functional.varlen_attention(
q,
k,
Expand Down
81 changes: 13 additions & 68 deletions server/text_generation_server/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@

FLASH_ATT_ERROR_MESSAGE = "{} requires Flash Attention enabled models."

SLIDING_WINDOW_MESSAGE = "The backend {} does not support sliding window attention. TGI webserver was started max_batch_prefill_tokens={} larger than sliding_window={}. To use this model with the {} backend, please launch TGI with the argument `--max-batch-prefill-tokens` smaller than {}."
SLIDING_WINDOW_MESSAGE = "The backend {} does not support sliding window attention. TGI webserver was started max_input_tokens={} larger than sliding_window={}. To use this model with the {} backend, please launch TGI with the argument `--max-batch-prefill-tokens` smaller than {}."

FLASH_ATTENTION = True

Expand Down Expand Up @@ -261,7 +261,7 @@ def get_model(
speculate: Optional[int],
dtype: Optional[str],
trust_remote_code: bool,
max_batch_prefill_tokens: int,
max_input_tokens: int,
) -> Model:
global FLASH_ATTENTION
if dtype is None:
Expand Down Expand Up @@ -416,9 +416,13 @@ def get_model(
)
sliding_window = config_dict.get("sliding_window", -1)

if sliding_window != -1 and not SUPPORTS_WINDOWING:
logger.warning(
f"Flash attention is available, but on the backend {SYSTEM} doesn't support sliding window attention, which is required by model {model_id} for long contexts."
if (
(sliding_window is not None and sliding_window != -1)
and not SUPPORTS_WINDOWING
and max_input_tokens > sliding_window
):
raise ValueError(
f"The backend {SYSTEM} does not support sliding window attention. TGI webserver was started max_input_tokens={max_input_tokens} larger than sliding_window={sliding_window}. To use this model with the {SYSTEM} backend, please launch TGI with the argument `--max-batch-prefill-tokens` smaller than {sliding_window}."
)

if model_type == MAMBA:
Expand Down Expand Up @@ -706,22 +710,7 @@ def get_model(
)

if model_type == MISTRAL:
sliding_window = config_dict.get("sliding_window", -1)
if (
(sliding_window is not None and sliding_window != -1)
and not SUPPORTS_WINDOWING
and max_batch_prefill_tokens > sliding_window
):
raise ValueError(
SLIDING_WINDOW_MESSAGE.format(
SYSTEM,
max_batch_prefill_tokens,
sliding_window,
SYSTEM,
sliding_window,
)
)
elif FLASH_ATTENTION:
if FLASH_ATTENTION:
return FlashMistral(
model_id,
revision,
Expand All @@ -743,22 +732,7 @@ def get_model(
)

if model_type == MIXTRAL:
sliding_window = config_dict.get("sliding_window", -1)
if (
(sliding_window is not None and sliding_window != -1)
and not SUPPORTS_WINDOWING
and max_batch_prefill_tokens > sliding_window
):
raise ValueError(
SLIDING_WINDOW_MESSAGE.format(
SYSTEM,
max_batch_prefill_tokens,
sliding_window,
SYSTEM,
sliding_window,
)
)
elif FLASH_ATTENTION:
if FLASH_ATTENTION:
return FlashMixtral(
model_id,
revision,
Expand All @@ -780,21 +754,7 @@ def get_model(
)

if model_type == STARCODER2:
if (
(sliding_window is not None and sliding_window != -1)
and not SUPPORTS_WINDOWING
and max_batch_prefill_tokens > sliding_window
):
raise ValueError(
SLIDING_WINDOW_MESSAGE.format(
SYSTEM,
max_batch_prefill_tokens,
sliding_window,
SYSTEM,
sliding_window,
)
)
elif FLASH_ATTENTION:
if FLASH_ATTENTION:
return FlashStarcoder2(
model_id,
revision,
Expand All @@ -817,22 +777,7 @@ def get_model(
)

if model_type == QWEN2:
sliding_window = config_dict.get("sliding_window", -1)
if (
(sliding_window is not None and sliding_window != -1)
and not SUPPORTS_WINDOWING
and max_batch_prefill_tokens > sliding_window
):
raise ValueError(
SLIDING_WINDOW_MESSAGE.format(
SYSTEM,
max_batch_prefill_tokens,
sliding_window,
SYSTEM,
sliding_window,
)
)
elif sliding_window is None or sliding_window != -1:
if FLASH_ATTENTION:
return FlashQwen2(
model_id,
revision,
Expand Down
4 changes: 2 additions & 2 deletions server/text_generation_server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def serve(
dtype: Optional[str],
trust_remote_code: bool,
uds_path: Path,
max_batch_prefill_tokens: int,
max_input_tokens: int,
):
async def serve_inner(
model_id: str,
Expand Down Expand Up @@ -230,7 +230,7 @@ async def serve_inner(
speculate,
dtype,
trust_remote_code,
max_batch_prefill_tokens,
max_input_tokens,
)
except Exception:
logger.exception("Error when initializing model")
Expand Down

0 comments on commit b884d2b

Please sign in to comment.