billusanda007 committed
Update app.py
app.py CHANGED
@@ -10,9 +10,12 @@ from huggingface_hub import login
 api_token = os.getenv('HF_TOKEN')
 
 # Load pre-trained model and tokenizer
-model_name = "gpt2"
+model_name = "gpt2-large"
 model = GPT2LMHeadModel.from_pretrained(model_name)
 tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+
+device = torch.device("mps") if torch.has_mps else torch.device("cpu")
+model.to(device)
 model.eval()
 
 def create_ngrams(tokens, n):
@@ -46,10 +49,12 @@ def kneser_ney_smoothing(ngram_counts, lower_order_counts, discount=0.75):
     return probabilities
 
 def generate_text_with_probs(initial_context, top_p, max_length, top_k):
-    input_ids = tokenizer.encode(initial_context, return_tensors="pt")
+    input_ids = tokenizer.encode(initial_context, return_tensors="pt").to(device)
     generated_text = initial_context
     token_tables = []
 
+    token_no = 1
+
     with torch.no_grad():
         for _ in range(max_length):
             outputs = model(input_ids=input_ids)
@@ -62,42 +67,35 @@ def generate_text_with_probs(initial_context, top_p, max_length, top_k):
             sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
             sorted_indices_to_remove[..., 0] = 0
 
-            # Convert boolean mask to indices to set logits to -inf
             indices_to_remove = sorted_indices[sorted_indices_to_remove]
             next_token_logits[:, indices_to_remove] = -float('Inf')
-
-            # Compute probabilities
             probabilities = torch.softmax(next_token_logits, dim=-1)
 
-            # Get the next token using multinomial sampling
             next_token = torch.multinomial(probabilities, num_samples=1)
-
-            # Get next token and its probability
             next_token_prob = probabilities[0, next_token].item()
             next_token_text = tokenizer.decode(next_token.item())
 
-
-            top_tokens = sorted_indices[0, :top_k]  # Get top k tokens
+            top_tokens = sorted_indices[0, :top_k]
             top_probs = probabilities[0, top_tokens]
             top_token_probs = [(tokenizer.decode([token.item()]), prob.item()) for token, prob in zip(top_tokens, top_probs)]
 
-            # Create DataFrame for current token's top-k probabilities
             df = pd.DataFrame(top_token_probs, columns=["Token", "Probability"])
-            df.index = df.index + 1
-            token_tables.append((f"Next token: {next_token_text} (Probability: {next_token_prob:.8f})", df))
+            df.index = df.index + 1
+            token_tables.append((f"{token_no}>> Next token: {next_token_text} (Probability: {next_token_prob:.8f})", df))
+            token_no += 1
 
-            # Add the next token to the input_ids
             input_ids = torch.cat([input_ids, next_token], dim=-1)
 
             if next_token.item() == tokenizer.eos_token_id:
                 break
 
-    # Decode the generated text
     generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
 
-    return generated_text, token_tables
+    return generated_text[len(initial_context):], token_tables
 
 def predict_next_token_ngram(input_text, context_text, max_length):
+
+    ip = input_text
     context_tokens = tokenizer.tokenize(context_text)
     four_grams = create_ngrams(context_tokens, 4)
     four_gram_counts = Counter(four_grams)
@@ -106,14 +104,17 @@ def predict_next_token_ngram(input_text, context_text, max_length):
     probs = kneser_ney_smoothing(four_gram_counts, three_gram_counts)
 
     input_tokens = tokenizer.tokenize(input_text)
+    generated_tokens = input_tokens.copy()
     generated_text = input_text
     token_tables = []
 
-    if len(input_tokens) >= max_length:
+    if len(input_tokens) >= (max_length + len(generated_tokens)):
         generated_text = tokenizer.convert_tokens_to_string(input_tokens)
         return generated_text, token_tables
 
-    while len(input_tokens) < max_length:
+    token_no = 1
+
+    while len(input_tokens) < (max_length + len(generated_tokens)):
         input_3_gram = tuple(input_tokens[-3:])
         next_token_probs = probs.get(input_3_gram, {})
         if not next_token_probs:
@@ -121,17 +122,17 @@ def predict_next_token_ngram(input_text, context_text, max_length):
         next_token = max(next_token_probs, key=next_token_probs.get)
         input_tokens.append(next_token)
 
-        # Get top tokens and their probabilities
         top_k = 4
         top_k_tokens = sorted(next_token_probs.items(), key=lambda x: x[1], reverse=True)[:top_k]
         top_k_tokens_df = pd.DataFrame(top_k_tokens, columns=["Token", "Probability"])
         top_k_tokens_df.index = top_k_tokens_df.index + 1  # Add numbering to the DataFrame
         top_k_tokens_df["Token"] = top_k_tokens_df["Token"].apply(lambda x: tokenizer.convert_tokens_to_string([x]))
 
-        token_tables.append((f"Next token: {next_token}", top_k_tokens_df))
+        token_tables.append((f"{token_no}>> Next token: {next_token}", top_k_tokens_df))
+        token_no += 1
 
     generated_text = tokenizer.convert_tokens_to_string(input_tokens)
-    return generated_text, token_tables
+    return generated_text[len(ip):], token_tables
 
 def combined_model_predictions(context_text, initial_context, top_p, max_length, top_k):
     generated_text, token_tables = generate_text_with_probs(initial_context, top_p, max_length, top_k)
@@ -146,17 +147,15 @@ iface = gr.Interface(
         gr.Textbox(lines=2, placeholder="Enter initial context here..."),
         gr.Slider(0, 1, step=0.01, value=0.9, label="Top-p (nucleus) sampling"),
         gr.Slider(1, 100, step=1, value=50, label="Max length"),
         gr.Slider(1, 50, step=1, value=10, label="Top-k"),
     ],
     outputs=[
         gr.Textbox(label="Generated Text"),
         gr.Dataframe(label="LLM Token Probabilities"),
         gr.Textbox(label="N-gram Generated Text"),
         gr.Dataframe(label="N-gram Token Predictions"),
     ],
-    title="Next Token Visualizer (GPT-2 - 124M param.)",
-    description="Generate text using GPT-2 with top-p (nucleus) sampling and see the probabilities of generated tokens in tables, along with N-gram model predictions.",
+    title="Next Token Visualizer (GPT-2-large - 812M param.)"
 )
 
-# Launch the Gradio app
 iface.launch()
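A note on the device logic this commit introduces: torch.has_mps works but is deprecated in newer PyTorch releases in favor of torch.backends.mps.is_available(), and the MPS-or-CPU choice skips CUDA entirely. A minimal, more portable sketch of the device pick, assuming a recent PyTorch build (pick_device is an illustrative helper, not part of app.py):

import torch

def pick_device() -> torch.device:
    # Prefer CUDA, then Apple-silicon MPS, else fall back to CPU.
    if torch.cuda.is_available():
        return torch.device("cuda")
    # The mps backend only exists on macOS builds of PyTorch.
    mps = getattr(torch.backends, "mps", None)
    if mps is not None and mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")

device = pick_device()

On a CUDA machine this resolves to "cuda", whereas the committed line would leave the GPU unused there.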
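The context lines in the hunk above are the heart of nucleus sampling: sort the logits, accumulate the softmax, and push every token outside the top-p mass to -inf before drawing a sample. A standalone sketch of that filtering step on a toy four-token vocabulary (the values and the name top_p_filter are made up for illustration):

import torch

def top_p_filter(logits: torch.Tensor, top_p: float = 0.9) -> torch.Tensor:
    # Sort descending and accumulate probability mass.
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
    # Mark everything past the top_p mass, shifted right so the token
    # that crosses the threshold is still kept.
    sorted_indices_to_remove = cumulative_probs > top_p
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0
    filtered = logits.clone()
    filtered[sorted_indices[sorted_indices_to_remove]] = -float("Inf")
    return filtered

logits = torch.tensor([2.0, 1.0, 0.5, -1.0])  # toy 4-token vocabulary
probs = torch.softmax(top_p_filter(logits), dim=-1)
next_token = torch.multinomial(probs, num_samples=1)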
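create_ngrams and kneser_ney_smoothing are only called, never defined, in the hunks shown. For orientation, a minimal sketch of the n-gram side under the usual definitions; discounted_probs is a simplified absolute-discounting stand-in for the file's Kneser-Ney step, not a copy of it:

from collections import Counter

def create_ngrams(tokens, n):
    # Sliding window of n consecutive tokens.
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def discounted_probs(ngram_counts, discount=0.75):
    # Subtract a fixed discount from each count and normalise by the
    # total count of the (n-1)-gram prefix.
    prefix_totals = Counter()
    for gram, count in ngram_counts.items():
        prefix_totals[gram[:-1]] += count
    probs = {}
    for gram, count in ngram_counts.items():
        prefix, token = gram[:-1], gram[-1]
        probs.setdefault(prefix, {})[token] = max(count - discount, 0.0) / prefix_totals[prefix]
    return probs

tokens = "the cat sat on the mat because the cat sat".split()
probs = discounted_probs(Counter(create_ngrams(tokens, 4)))
print(probs[("on", "the", "mat")])  # {'because': 0.25}

The prediction loop then looks up the last three generated tokens, exactly as input_3_gram = tuple(input_tokens[-3:]) does above.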
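The interface edits above must keep the inputs and outputs lists positionally aligned with combined_model_predictions, which takes five arguments and returns four values. A trimmed, runnable sketch of that wiring with a placeholder function; the first textbox's placeholder string is assumed, since the top of the inputs list falls outside the hunk:

import gradio as gr
import pandas as pd

def demo_fn(context_text, initial_context, top_p, max_length, top_k):
    # Placeholder standing in for combined_model_predictions.
    df = pd.DataFrame({"Token": ["example"], "Probability": [0.42]})
    return initial_context, df, context_text, df

iface = gr.Interface(
    fn=demo_fn,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter n-gram training context..."),  # assumed label
        gr.Textbox(lines=2, placeholder="Enter initial context here..."),
        gr.Slider(0, 1, step=0.01, value=0.9, label="Top-p (nucleus) sampling"),
        gr.Slider(1, 100, step=1, value=50, label="Max length"),
        gr.Slider(1, 50, step=1, value=10, label="Top-k"),
    ],
    outputs=[
        gr.Textbox(label="Generated Text"),
        gr.Dataframe(label="LLM Token Probabilities"),
        gr.Textbox(label="N-gram Generated Text"),
        gr.Dataframe(label="N-gram Token Predictions"),
    ],
    title="Next Token Visualizer (GPT-2-large - 812M param.)",
)

iface.launch()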