LiangliangMa
committed on
Use try-except for flash_attn import
Browse files
This PR avoids a hard failure in the transformers import-dependency check.
If a user has not installed flash_attn, the model file will not pass the import check for flash_attn. The same applies to non-CUDA users.
To solve this, the flash_attn imports are wrapped in try-except, because imports inside try-except blocks are filtered out of that check: https://github.com/huggingface/transformers/blob/main/src/transformers/dynamic_module_utils.py#L155.
- modeling_deepseek.py +3 -28
modeling_deepseek.py
CHANGED
@@ -48,7 +48,6 @@ from transformers.pytorch_utils import (
|
|
48 |
from transformers.utils import (
|
49 |
add_start_docstrings,
|
50 |
add_start_docstrings_to_model_forward,
|
51 |
-
is_flash_attn_2_available,
|
52 |
is_flash_attn_greater_or_equal_2_10,
|
53 |
logging,
|
54 |
replace_return_docstrings,
|
@@ -58,10 +57,11 @@ from .configuration_deepseek import DeepseekV2Config
|
|
58 |
import torch.distributed as dist
|
59 |
import numpy as np
|
60 |
|
61 |
-
|
62 |
from flash_attn import flash_attn_func, flash_attn_varlen_func
|
63 |
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
|
64 |
-
|
|
|
65 |
|
66 |
# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
|
67 |
# It means that the function will not be traced through and simply appear as a node in the graph.
|
@@ -338,7 +338,6 @@ def rotate_half(x):
|
|
338 |
# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
|
339 |
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
|
340 |
"""Applies Rotary Position Embedding to the query and key tensors.
|
341 |
-
|
342 |
Args:
|
343 |
q (`torch.Tensor`): The query tensor.
|
344 |
k (`torch.Tensor`): The key tensor.
|
@@ -1076,7 +1075,6 @@ class DeepseekV2FlashAttention2(DeepseekV2Attention):
|
|
1076 |
"""
|
1077 |
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
|
1078 |
first unpad the input, then computes the attention scores and pad the final attention scores.
|
1079 |
-
|
1080 |
Args:
|
1081 |
query_states (`torch.Tensor`):
|
1082 |
Input query states to be passed to Flash Attention API
|
@@ -1287,11 +1285,9 @@ DeepseekV2_START_DOCSTRING = r"""
|
|
1287 |
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
1288 |
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
1289 |
etc.)
|
1290 |
-
|
1291 |
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
1292 |
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
1293 |
and behavior.
|
1294 |
-
|
1295 |
Parameters:
|
1296 |
config ([`DeepseekV2Config`]):
|
1297 |
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
@@ -1330,50 +1326,38 @@ DeepseekV2_INPUTS_DOCSTRING = r"""
|
|
1330 |
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
1331 |
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
1332 |
it.
|
1333 |
-
|
1334 |
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
1335 |
[`PreTrainedTokenizer.__call__`] for details.
|
1336 |
-
|
1337 |
[What are input IDs?](../glossary#input-ids)
|
1338 |
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
1339 |
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
1340 |
-
|
1341 |
- 1 for tokens that are **not masked**,
|
1342 |
- 0 for tokens that are **masked**.
|
1343 |
-
|
1344 |
[What are attention masks?](../glossary#attention-mask)
|
1345 |
-
|
1346 |
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
1347 |
[`PreTrainedTokenizer.__call__`] for details.
|
1348 |
-
|
1349 |
If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
|
1350 |
`past_key_values`).
|
1351 |
-
|
1352 |
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
|
1353 |
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
|
1354 |
information on the default strategy.
|
1355 |
-
|
1356 |
- 1 indicates the head is **not masked**,
|
1357 |
- 0 indicates the head is **masked**.
|
1358 |
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
1359 |
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
1360 |
config.n_positions - 1]`.
|
1361 |
-
|
1362 |
[What are position IDs?](../glossary#position-ids)
|
1363 |
past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
|
1364 |
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
1365 |
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
|
1366 |
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
|
1367 |
-
|
1368 |
Two formats are allowed:
|
1369 |
- a [`~cache_utils.Cache`] instance;
|
1370 |
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
|
1371 |
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
|
1372 |
cache format.
|
1373 |
-
|
1374 |
The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
|
1375 |
legacy cache format will be returned.
|
1376 |
-
|
1377 |
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
|
1378 |
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
|
1379 |
of shape `(batch_size, sequence_length)`.
|
@@ -1402,7 +1386,6 @@ DeepseekV2_INPUTS_DOCSTRING = r"""
|
|
1402 |
class DeepseekV2Model(DeepseekV2PreTrainedModel):
|
1403 |
"""
|
1404 |
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
|
1405 |
-
|
1406 |
Args:
|
1407 |
config: DeepseekV2Config
|
1408 |
"""
|
@@ -1638,20 +1621,14 @@ class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
|
|
1638 |
Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
|
1639 |
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
1640 |
(masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.
|
1641 |
-
|
1642 |
Returns:
|
1643 |
-
|
1644 |
Example:
|
1645 |
-
|
1646 |
```python
|
1647 |
>>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM
|
1648 |
-
|
1649 |
>>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
|
1650 |
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
|
1651 |
-
|
1652 |
>>> prompt = "Hey, are you conscious? Can you talk to me?"
|
1653 |
>>> inputs = tokenizer(prompt, return_tensors="pt")
|
1654 |
-
|
1655 |
>>> # Generate
|
1656 |
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
|
1657 |
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
@@ -1793,10 +1770,8 @@ class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
|
|
1793 |
@add_start_docstrings(
|
1794 |
"""
|
1795 |
The DeepseekV2 Model transformer with a sequence classification head on top (linear layer).
|
1796 |
-
|
1797 |
[`DeepseekV2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
|
1798 |
(e.g. GPT-2) do.
|
1799 |
-
|
1800 |
Since it does classification on the last token, it requires to know the position of the last token. If a
|
1801 |
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
|
1802 |
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
|
|
|
48 |
from transformers.utils import (
|
49 |
add_start_docstrings,
|
50 |
add_start_docstrings_to_model_forward,
|
|
|
51 |
is_flash_attn_greater_or_equal_2_10,
|
52 |
logging,
|
53 |
replace_return_docstrings,
|
|
|
57 |
import torch.distributed as dist
|
58 |
import numpy as np
|
59 |
|
60 |
+
try:
|
61 |
from flash_attn import flash_attn_func, flash_attn_varlen_func
|
62 |
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
|
63 |
+
except ImportError:
|
64 |
+
pass
|
65 |
|
66 |
# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
|
67 |
# It means that the function will not be traced through and simply appear as a node in the graph.
|
|
|
338 |
# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
|
339 |
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
|
340 |
"""Applies Rotary Position Embedding to the query and key tensors.
|
|
|
341 |
Args:
|
342 |
q (`torch.Tensor`): The query tensor.
|
343 |
k (`torch.Tensor`): The key tensor.
|
|
|
1075 |
"""
|
1076 |
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
|
1077 |
first unpad the input, then computes the attention scores and pad the final attention scores.
|
|
|
1078 |
Args:
|
1079 |
query_states (`torch.Tensor`):
|
1080 |
Input query states to be passed to Flash Attention API
|
|
|
1285 |
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
1286 |
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
1287 |
etc.)
|
|
|
1288 |
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
1289 |
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
1290 |
and behavior.
|
|
|
1291 |
Parameters:
|
1292 |
config ([`DeepseekV2Config`]):
|
1293 |
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
|
|
1326 |
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
1327 |
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
1328 |
it.
|
|
|
1329 |
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
1330 |
[`PreTrainedTokenizer.__call__`] for details.
|
|
|
1331 |
[What are input IDs?](../glossary#input-ids)
|
1332 |
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
1333 |
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
|
|
1334 |
- 1 for tokens that are **not masked**,
|
1335 |
- 0 for tokens that are **masked**.
|
|
|
1336 |
[What are attention masks?](../glossary#attention-mask)
|
|
|
1337 |
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
1338 |
[`PreTrainedTokenizer.__call__`] for details.
|
|
|
1339 |
If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
|
1340 |
`past_key_values`).
|
|
|
1341 |
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
|
1342 |
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
|
1343 |
information on the default strategy.
|
|
|
1344 |
- 1 indicates the head is **not masked**,
|
1345 |
- 0 indicates the head is **masked**.
|
1346 |
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
1347 |
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
1348 |
config.n_positions - 1]`.
|
|
|
1349 |
[What are position IDs?](../glossary#position-ids)
|
1350 |
past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
|
1351 |
Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
1352 |
blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
|
1353 |
returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
|
|
|
1354 |
Two formats are allowed:
|
1355 |
- a [`~cache_utils.Cache`] instance;
|
1356 |
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
|
1357 |
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
|
1358 |
cache format.
|
|
|
1359 |
The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
|
1360 |
legacy cache format will be returned.
|
|
|
1361 |
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
|
1362 |
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
|
1363 |
of shape `(batch_size, sequence_length)`.
|
|
|
1386 |
class DeepseekV2Model(DeepseekV2PreTrainedModel):
|
1387 |
"""
|
1388 |
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
|
|
|
1389 |
Args:
|
1390 |
config: DeepseekV2Config
|
1391 |
"""
|
|
|
1621 |
Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
|
1622 |
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
1623 |
(masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.
|
|
|
1624 |
Returns:
|
|
|
1625 |
Example:
|
|
|
1626 |
```python
|
1627 |
>>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM
|
|
|
1628 |
>>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
|
1629 |
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
|
|
|
1630 |
>>> prompt = "Hey, are you conscious? Can you talk to me?"
|
1631 |
>>> inputs = tokenizer(prompt, return_tensors="pt")
|
|
|
1632 |
>>> # Generate
|
1633 |
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
|
1634 |
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
|
|
1770 |
@add_start_docstrings(
|
1771 |
"""
|
1772 |
The DeepseekV2 Model transformer with a sequence classification head on top (linear layer).
|
|
|
1773 |
[`DeepseekV2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
|
1774 |
(e.g. GPT-2) do.
|
|
|
1775 |
Since it does classification on the last token, it requires to know the position of the last token. If a
|
1776 |
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
|
1777 |
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
|