Support gradient checkpointing
#3
by
maxall4
- opened
- config.json +1 -1
- model.py +18 -7
- modeling_hyena.py +25 -0
config.json
CHANGED
@@ -87,4 +87,4 @@
|
|
87 |
"use_flashfft": false,
|
88 |
"use_interpolated_rotary_pos_emb": true,
|
89 |
"vocab_size": 512
|
90 |
-
}
|
|
|
87 |
"use_flashfft": false,
|
88 |
"use_interpolated_rotary_pos_emb": true,
|
89 |
"vocab_size": 512
|
90 |
+
}
|
model.py
CHANGED
@@ -22,6 +22,8 @@ try:
|
|
22 |
except ImportError:
|
23 |
"could not import swap_mha_rope from positional_embeddings.py"
|
24 |
|
|
|
|
|
25 |
# dummy import to force huggingface to bundle the tokenizer
|
26 |
from .tokenizer import ByteTokenizer
|
27 |
|
@@ -64,6 +66,7 @@ class AttentionBlock(nn.Module):
|
|
64 |
self.inner_mha_cls.rotary_emb.register_buffer("inv_freq", self.inner_mha_cls.rotary_emb.inv_freq)
|
65 |
|
66 |
self.mlp = ParallelGatedMLP(config).to(dtype=mlp_dtype)
|
|
|
67 |
|
68 |
def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
|
69 |
if (
|
@@ -71,13 +74,12 @@ class AttentionBlock(nn.Module):
|
|
71 |
): # workaround for masking bug in FA. This works because Wqkv does not have bias
|
72 |
# and attention scores will be also automatically zeroed.
|
73 |
u = u * padding_mask[..., None]
|
74 |
-
u = (
|
75 |
-
self.inner_mha_cls(
|
76 |
self.pre_norm(u),
|
77 |
inference_params=inference_params,
|
78 |
-
)
|
79 |
-
+ u
|
80 |
)
|
|
|
|
|
81 |
if type(padding_mask) == torch.Tensor: # guard against bias
|
82 |
u = u * padding_mask[..., None]
|
83 |
u = self.mlp(self.post_norm(u)) + u
|
@@ -120,7 +122,7 @@ class ParallelHyenaFilter(nn.Module):
|
|
120 |
self.data_dtype = None
|
121 |
|
122 |
if self.use_flash_depthwise:
|
123 |
-
self.fir_fn =
|
124 |
channels=3 * self.hidden_size,
|
125 |
kernel_size=self.short_filter_length,
|
126 |
padding=self.short_filter_length - 1,
|
@@ -287,6 +289,7 @@ class ParallelGatedConvBlock(nn.Module):
|
|
287 |
|
288 |
self.proj_norm_fn = self.proj_norm
|
289 |
self.res_mlp_norm_fn = self.res_mlp_norm
|
|
|
290 |
|
291 |
if self.config.get("compile", False):
|
292 |
self.proj_norm_fn = torch.compile(self.proj_norm, fullgraph=True, dynamic=False, mode="reduce-overhead")
|
@@ -308,6 +311,8 @@ class ParallelGatedConvBlock(nn.Module):
|
|
308 |
|
309 |
z, inference_params = self.filter(z, inference_params=inference_params, padding_mask=padding_mask)
|
310 |
|
|
|
|
|
311 |
z_in = self.out_filter_dense(z) + u
|
312 |
|
313 |
if type(padding_mask) == torch.Tensor: # guard against bias
|
@@ -343,13 +348,15 @@ class StripedHyena(nn.Module):
|
|
343 |
from flashfftconv import FlashFFTConv
|
344 |
except:
|
345 |
raise ImportError
|
346 |
-
self.flash_fft = FlashFFTConv(2 * config.
|
347 |
else:
|
348 |
self.flash_fft = None
|
349 |
|
350 |
self.blocks = nn.ModuleList(
|
351 |
get_block(config, layer_idx, flash_fft=self.flash_fft) for layer_idx in range(config.num_layers)
|
352 |
)
|
|
|
|
|
353 |
|
354 |
def forward(self, x, inference_params_dict=None, padding_mask=None):
|
355 |
L = x.shape[1]
|
@@ -379,7 +386,11 @@ class StripedHyena(nn.Module):
|
|
379 |
x = x * padding_mask[..., None]
|
380 |
|
381 |
for _, block in enumerate(self.blocks):
|
382 |
-
x, _ = block(x, inference_params=None, padding_mask=padding_mask)
|
|
|
|
|
|
|
|
|
383 |
return x, None
|
384 |
|
385 |
def initialize_inference_params(self):
|
|
|
22 |
except ImportError:
|
23 |
"could not import swap_mha_rope from positional_embeddings.py"
|
24 |
|
25 |
+
from flashfftconv import FlashDepthWiseConv1d
|
26 |
+
|
27 |
# dummy import to force huggingface to bundle the tokenizer
|
28 |
from .tokenizer import ByteTokenizer
|
29 |
|
|
|
66 |
self.inner_mha_cls.rotary_emb.register_buffer("inv_freq", self.inner_mha_cls.rotary_emb.inv_freq)
|
67 |
|
68 |
self.mlp = ParallelGatedMLP(config).to(dtype=mlp_dtype)
|
69 |
+
self.filter_output = None
|
70 |
|
71 |
def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
|
72 |
if (
|
|
|
74 |
): # workaround for masking bug in FA. This works because Wqkv does not have bias
|
75 |
# and attention scores will be also automatically zeroed.
|
76 |
u = u * padding_mask[..., None]
|
77 |
+
w = self.inner_mha_cls(
|
|
|
78 |
self.pre_norm(u),
|
79 |
inference_params=inference_params,
|
|
|
|
|
80 |
)
|
81 |
+
self.filter_output = w
|
82 |
+
u = w + u
|
83 |
if type(padding_mask) == torch.Tensor: # guard against bias
|
84 |
u = u * padding_mask[..., None]
|
85 |
u = self.mlp(self.post_norm(u)) + u
|
|
|
122 |
self.data_dtype = None
|
123 |
|
124 |
if self.use_flash_depthwise:
|
125 |
+
self.fir_fn = FlashDepthWiseConv1d(
|
126 |
channels=3 * self.hidden_size,
|
127 |
kernel_size=self.short_filter_length,
|
128 |
padding=self.short_filter_length - 1,
|
|
|
289 |
|
290 |
self.proj_norm_fn = self.proj_norm
|
291 |
self.res_mlp_norm_fn = self.res_mlp_norm
|
292 |
+
self.filter_output = None
|
293 |
|
294 |
if self.config.get("compile", False):
|
295 |
self.proj_norm_fn = torch.compile(self.proj_norm, fullgraph=True, dynamic=False, mode="reduce-overhead")
|
|
|
311 |
|
312 |
z, inference_params = self.filter(z, inference_params=inference_params, padding_mask=padding_mask)
|
313 |
|
314 |
+
self.filter_output = z
|
315 |
+
|
316 |
z_in = self.out_filter_dense(z) + u
|
317 |
|
318 |
if type(padding_mask) == torch.Tensor: # guard against bias
|
|
|
348 |
from flashfftconv import FlashFFTConv
|
349 |
except:
|
350 |
raise ImportError
|
351 |
+
self.flash_fft = FlashFFTConv(2 * config.max_seqlen, dtype=torch.bfloat16)
|
352 |
else:
|
353 |
self.flash_fft = None
|
354 |
|
355 |
self.blocks = nn.ModuleList(
|
356 |
get_block(config, layer_idx, flash_fft=self.flash_fft) for layer_idx in range(config.num_layers)
|
357 |
)
|
358 |
+
self.gradient_checkpointing = False
|
359 |
+
self._gradient_checkpointing_func = None
|
360 |
|
361 |
def forward(self, x, inference_params_dict=None, padding_mask=None):
|
362 |
L = x.shape[1]
|
|
|
386 |
x = x * padding_mask[..., None]
|
387 |
|
388 |
for _, block in enumerate(self.blocks):
|
389 |
+
if self.gradient_checkpointing and self.training:
|
390 |
+
x, _ = self._gradient_checkpointing_func(block.__call__, x, None, padding_mask)
|
391 |
+
else:
|
392 |
+
x, _ = block(x, inference_params=None, padding_mask=padding_mask)
|
393 |
+
|
394 |
return x, None
|
395 |
|
396 |
def initialize_inference_params(self):
|
modeling_hyena.py
CHANGED
@@ -2,6 +2,7 @@
|
|
2 |
"""StripedHyena custom code port for the Hugging Face Hub"""
|
3 |
|
4 |
import torch
|
|
|
5 |
from torch.nn import functional as F
|
6 |
from .configuration_hyena import StripedHyenaConfig
|
7 |
from transformers import PreTrainedModel
|
@@ -50,8 +51,32 @@ class StripedHyenaModelForCausalLM(StripedHyenaPreTrainedModel):
|
|
50 |
def force_dtype(self):
|
51 |
self.backbone.to_bfloat16_except_poles_residues()
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
def _set_gradient_checkpointing(self, enable, gradient_checkpointing_func):
|
54 |
self.backbone.gradient_checkpointing = enable
|
|
|
55 |
|
56 |
def get_input_embeddings(self):
|
57 |
return self.backbone.embedding_layer
|
|
|
2 |
"""StripedHyena custom code port for the Hugging Face Hub"""
|
3 |
|
4 |
import torch
|
5 |
+
import functools
|
6 |
from torch.nn import functional as F
|
7 |
from .configuration_hyena import StripedHyenaConfig
|
8 |
from transformers import PreTrainedModel
|
|
|
51 |
def force_dtype(self):
|
52 |
self.backbone.to_bfloat16_except_poles_residues()
|
53 |
|
54 |
+
def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
|
55 |
+
if not self.supports_gradient_checkpointing:
|
56 |
+
raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
|
57 |
+
|
58 |
+
if gradient_checkpointing_kwargs is None:
|
59 |
+
gradient_checkpointing_kwargs = {"use_reentrant": True}
|
60 |
+
|
61 |
+
# TODO support deepspeed checkpoint
|
62 |
+
gradient_checkpointing_func = functools.partial(
|
63 |
+
torch.utils.checkpoint.checkpoint, **gradient_checkpointing_kwargs
|
64 |
+
)
|
65 |
+
|
66 |
+
self._set_gradient_checkpointing(
|
67 |
+
enable=True, gradient_checkpointing_func=gradient_checkpointing_func
|
68 |
+
)
|
69 |
+
|
70 |
+
if getattr(self, "_hf_peft_config_loaded", False):
|
71 |
+
# When using PEFT + gradient checkpointing + Trainer we need to make sure the input has requires_grad=True
|
72 |
+
# we do it also on PEFT: https://github.com/huggingface/peft/blob/85013987aa82aa1af3da1236b6902556ce3e483e/src/peft/peft_model.py#L334
|
73 |
+
# When training with PEFT, only LoRA layers will have requires grad set to True, but the output of frozen layers need to propagate
|
74 |
+
# the gradients to make sure the gradient flows.
|
75 |
+
self.enable_input_require_grads()
|
76 |
+
|
77 |
def _set_gradient_checkpointing(self, enable, gradient_checkpointing_func):
|
78 |
self.backbone.gradient_checkpointing = enable
|
79 |
+
self.backbone._gradient_checkpointing_func = gradient_checkpointing_func
|
80 |
|
81 |
def get_input_embeddings(self):
|
82 |
return self.backbone.embedding_layer
|