Crystalcareai committed "Update modeling_quiet.py"

modeling_quiet.py CHANGED (+94 -76)
@@ -23,6 +23,7 @@ import math
 import copy
 import os
 import time
+import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 import wandb
@@ -68,6 +69,73 @@ logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "QuietConfig"
 
+from reportlab.pdfgen import canvas
+from reportlab.lib.pagesizes import letter
+from reportlab.lib.colors import HexColor
+
+def save_tokens_with_rewards_to_pdf(input_ids, token_rewards, tokenizer, output_file="text.pdf", eps=0.2, eps2=0.5):
+    c = canvas.Canvas(output_file, pagesize=letter)
+    c.setFont("Courier", 8)
+    x, y = 50, 750
+    previous_text = ""
+    current_text = ""
+    for token_idx, reward in enumerate(token_rewards):
+        current_text = tokenizer.decode(input_ids[: token_idx + 1])
+        if current_text != previous_text:
+            diff_text = current_text[len(previous_text) :]
+            if "\n" in diff_text:
+                lines = diff_text.split("\n")
+                for line_idx, line in enumerate(lines):
+                    if line_idx > 0:
+                        x = 50
+                        y -= 12
+                    if abs(reward) < eps:
+                        opacity = 0
+                    elif abs(reward) > eps2:
+                        opacity = 0.8
+                    else:
+                        opacity = 0.8 * (abs(reward) - eps) / (eps2 - eps)
+                    text_width = c.stringWidth(line)
+                    if reward > 0:
+                        highlight_color = HexColor("#4CCD99")
+                    else:
+                        highlight_color = HexColor("#FFC700")
+                    highlight_color.alpha = opacity
+                    c.setFillColor(highlight_color)
+                    c.rect(x, y - 2, text_width, 10, fill=True, stroke=False)
+                    c.setFillColor(HexColor("#000000"))
+                    c.drawString(x, y, line)
+                    x += text_width
+            else:
+                if abs(reward) < eps:
+                    opacity = 0
+                elif abs(reward) > eps2:
+                    opacity = 0.8
+                else:
+                    opacity = 0.8 * (abs(reward) - eps) / (eps2 - eps)
+                text_width = c.stringWidth(diff_text)
+                if reward > 0:
+                    highlight_color = HexColor("#4CCD99")
+                else:
+                    highlight_color = HexColor("#FFC700")
+                highlight_color.alpha = opacity
+                c.setFillColor(highlight_color)
+                c.rect(x, y - 2, text_width, 10, fill=True, stroke=False)
+                c.setFillColor(HexColor("#000000"))
+                c.drawString(x, y, diff_text)
+                x += text_width
+            if x > 550:
+                x = 50
+                y -= 12
+            if y < 50:
+                c.showPage()
+                y = 750
+                x = 50
+            previous_text = current_text
+    c.showPage()
+    c.save()
+
+
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def _get_unpad_data(attention_mask):
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
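The hunk above adds save_tokens_with_rewards_to_pdf, a reportlab helper that re-decodes the sequence token by token and highlights each new span according to its reward. A minimal usage sketch, not part of the commit; the import path, checkpoint name, and reward values are placeholders:

# Sketch only: hypothetical call to the helper added above.
# The module path and tokenizer checkpoint are assumptions, and the rewards are made up.
import torch
from transformers import AutoTokenizer
from modeling_quiet import save_tokens_with_rewards_to_pdf  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")  # placeholder checkpoint
input_ids = tokenizer("Thinking before speaking helps.", return_tensors="pt").input_ids[0]
token_rewards = [0.1 * (i % 7 - 3) for i in range(len(input_ids))]  # one fake reward per token

save_tokens_with_rewards_to_pdf(
    input_ids,
    token_rewards,
    tokenizer,
    output_file="token_rewards.pdf",
    eps=0.2,   # rewards below this magnitude get no highlight
    eps2=0.5,  # rewards above this magnitude get the full 0.8 opacity
)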
@@ -257,13 +325,6 @@ class QuietAttention(nn.Module):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-
-        if past_key_value is not None:
-            expected_attention_mask_size = (bsz, 1, q_len, q_len + past_key_value.get_usable_length(q_len, self.layer_idx))
-            if attention_mask.size() != expected_attention_mask_size:
-                # Assuming the attention mask is larger than expected, slice it to match the expected size
-                attention_mask = attention_mask[:, :, :, -expected_attention_mask_size[-1]:]
-
         if "padding_mask" in kwargs:
             warnings.warn(
                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
@@ -277,10 +338,6 @@ class QuietAttention(nn.Module):
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        query_states = query_states.to(attention_mask.dtype)
-        key_states = key_states.to(attention_mask.dtype)
-        value_states = value_states.to(attention_mask.dtype)
 
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
@@ -311,16 +368,11 @@ class QuietAttention(nn.Module):
         )
 
         if attention_mask is not None:
-            if attention_mask.dim() == 3:
-                attention_mask = attention_mask.unsqueeze(1)
-            elif attention_mask.dim() == 2:
-                attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-            if attention_mask.size(0) != bsz or attention_mask.size(-1) != kv_seq_len:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                 raise ValueError(
-                    f"Attention mask should be of size (
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                 )
-
+
         attn_weights = attn_weights + attention_mask
 
         # upcast attention to fp32
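Both attention classes now require the mask to arrive as a 4D additive mask of size (bsz, 1, q_len, kv_seq_len) instead of reshaping 2D or 3D masks in place. A standalone sketch of building a mask with that shape, using assumed example dimensions rather than code from this file:

# Sketch only: construct an additive causal mask with the (bsz, 1, q_len, kv_seq_len)
# shape that the new size check expects. The dimensions are arbitrary examples.
import torch

bsz, q_len, past_len = 2, 5, 3
kv_seq_len = q_len + past_len

padding_mask = torch.ones(bsz, kv_seq_len, dtype=torch.bool)  # True = real token
causal = torch.tril(torch.ones(q_len, kv_seq_len), diagonal=past_len).bool()  # query i sees keys <= i + past_len
keep = (causal & padding_mask[:, None, :]).unsqueeze(1)  # (bsz, 1, q_len, kv_seq_len)

attention_mask = torch.zeros(bsz, 1, q_len, kv_seq_len)
attention_mask = attention_mask.masked_fill(~keep, torch.finfo(torch.float32).min)
assert attention_mask.size() == (bsz, 1, q_len, kv_seq_len)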
@@ -697,21 +749,11 @@ class QuietSdpaAttention(QuietAttention):
         value_states = repeat_kv(value_states, self.num_key_value_groups)
 
         if attention_mask is not None:
-            if attention_mask.dim() == 3:
-                attention_mask = attention_mask.unsqueeze(1)
-            elif attention_mask.dim() == 2:
-                attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-        if attention_mask is not None:
-            if attention_mask.dim() == 3:
-                attention_mask = attention_mask.unsqueeze(1)
-            elif attention_mask.dim() == 2:
-                attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-            if attention_mask.size(0) != bsz or attention_mask.size(-1) != kv_seq_len:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                 raise ValueError(
-                    f"Attention mask should be of size (
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                 )
+
         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
         # Reference: https://github.com/pytorch/pytorch/issues/112577.
         if query_states.device.type == "cuda" and attention_mask is not None:
@@ -719,12 +761,6 @@ class QuietSdpaAttention(QuietAttention):
             key_states = key_states.contiguous()
             value_states = value_states.contiguous()
 
-
-        # Cast query_states, key_states, and value_states to the same data type as attention_mask
-        query_states = query_states.to(attention_mask.dtype)
-        key_states = key_states.to(attention_mask.dtype)
-        value_states = value_states.to(attention_mask.dtype)
-
         attn_output = torch.nn.functional.scaled_dot_product_attention(
             query_states,
             key_states,
@@ -1291,28 +1327,8 @@ class QuietForCausalLM(QuietPreTrainedModel):
         # Generate the continuation
         continuation_length = self.n_ahead - 2
         new_key_values = past_key_values
+        generated_tokens = []
 
-        if self.n_ahead != 1 or self.n_ahead_talk != 1 or self.comparison_mode:
-            if attention_mask is None:
-                base_attention_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=0).to(input_ids.device)
-                base_attention_mask = base_attention_mask.view(1, 1, seq_len, seq_len)
-                base_attention_mask = base_attention_mask.repeat(input_ids.shape[0], 1, 1, 1)
-                attention_mask = base_attention_mask
-            elif attention_mask.dim() == 2:
-                if seq_len + past_key_values_length != attention_mask.shape[-1]:
-                    attention_mask = torch.cat(
-                        [torch.ones((attention_mask.shape[0], past_key_values_length), dtype=attention_mask.dtype, device=attention_mask.device), attention_mask],
-                        dim=-1
-                    )
-                attention_mask = _prepare_4d_causal_attention_mask(
-                    attention_mask,
-                    (batch_size, seq_len),
-                    inputs_embeds,
-                    past_key_values_length,
-                    sliding_window=self.config.sliding_window,
-                )
-
-        start_time = time.time()
         for continuation_idx in range(continuation_length):
             outputs = self.model(
                 input_ids=input_ids if continuation_idx == 0 else next_token_id.unsqueeze(-1).to(input_ids.device),
@@ -1326,9 +1342,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
                 return_dict=return_dict,
             )
             new_key_values = outputs.past_key_values
-
             hidden_states = outputs[0]
-
             logits = self.lm_head(hidden_states)
             logits = logits[:, -1, :] # Only consider the last token
 
@@ -1338,12 +1352,17 @@ class QuietForCausalLM(QuietPreTrainedModel):
 
             # Append the generated token to the input sequence
            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
+            generated_tokens.append(next_token_id)
             seq_len += 1
 
             # Update the attention mask
             if attention_mask is not None:
                 attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
 
+            # Update the position ids
+            if position_ids is not None:
+                position_ids = torch.cat([position_ids, (position_ids[:, -1] + 1).unsqueeze(-1)], dim=-1)
+
         # Append the end thought token to the input sequence
         end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
         input_ids = torch.cat([input_ids, torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
@@ -1389,7 +1408,12 @@ class QuietForCausalLM(QuietPreTrainedModel):
 
         # Apply the language model head to get the final logits
         logits = self.lm_head(mixed_hidden_states)
-
+
+        # Decode the logits to get the generated text
+        generated_tokens = torch.cat(generated_tokens, dim=-1)
+        generated_text = self.tokenizer.decode(generated_tokens.squeeze(), skip_special_tokens=True)
+
+        return generated_text
 
     @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
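With the generated_tokens bookkeeping added earlier, this path now decodes the collected thought tokens and returns plain text. A small sketch of the same cat-and-decode step in isolation, with placeholder token ids and a placeholder tokenizer checkpoint:

# Sketch only: mirrors the torch.cat + tokenizer.decode pattern added above.
# The token ids and the checkpoint are made-up placeholders.
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")  # placeholder checkpoint

# One (batch, 1) tensor per generated step, as collected in generated_tokens.
generated_tokens = [torch.tensor([[1763]]), torch.tensor([[349]]), torch.tensor([[264]])]
ids = torch.cat(generated_tokens, dim=-1)  # (batch, num_generated)
text = tokenizer.decode(ids.squeeze(), skip_special_tokens=True)
print(text)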
@@ -1662,9 +1686,6 @@ class QuietForCausalLM(QuietPreTrainedModel):
             prev_rm_logits = rm_logits # for policy gradient
             prev_rm_tokens = cur_rm_tokens # for policy gradient
 
-            hidden_states_lm = hidden_states
-            logits = self.lm_head(hidden_states_lm)
-
             if ahead_idx == 0:
                 hidden_states_lm = hidden_states
                 logits = self.lm_head(hidden_states_lm)
@@ -1682,16 +1703,14 @@ class QuietForCausalLM(QuietPreTrainedModel):
                 assert self.no_residual
                 residual_logits = self.lm_head(hidden_states)
                 talk_hidden_states = hidden_states
-                if 'hidden_states_lm' not in locals():
-                    hidden_states_lm = hidden_states
-                rm_hidden_states = hidden_states
-                if ahead_idx > self.n_ahead - 1:
-                    cur_base_hidden = torch.cat([
-                        base_hidden_states[..., ahead_idx - self.n_ahead + 1:, :],
-                        base_hidden_states[..., :ahead_idx - self.n_ahead + 1, :]
-                    ], dim=-2)
             else:
-                cur_base_hidden = base_hidden_states
+                if ahead_idx > self.n_ahead - 1:
+                    cur_base_hidden = torch.cat([
+                        base_hidden_states[..., ahead_idx - self.n_ahead + 1:, :],
+                        base_hidden_states[..., :ahead_idx - self.n_ahead + 1, :]
+                    ], dim=-2)
+                else:
+                    cur_base_hidden = base_hidden_states
 
                 if self.use_concat_talk_head:
                     # concatenate the hidden states with the original hidden states
@@ -1782,7 +1801,6 @@ class QuietForCausalLM(QuietPreTrainedModel):
             if not self.comparison_mode and not (self.optimize_lm_head_only_at_start and (self.n_ahead + self.n_ahead_talk > 2)) or self.original_mode:
                 loss_list.append(loss)
                 talk_loss_list.append(nonzero_mean(loss).detach())
-
 
             if not attempted or self.comparison_mode:
                 rm_hidden_states = hidden_states
@@ -2366,4 +2384,4 @@ class QuietForSequenceClassification(QuietPreTrainedModel):
             past_key_values=transformer_outputs.past_key_values,
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
-        )
+        )