DataChem committed on
Commit 1f3e16d · verified · 1 parent: 977cc0a

Update app.py

Files changed (1)
  1. app.py +17 -44
app.py CHANGED
@@ -7,49 +7,25 @@ import torch.nn.functional as F
 
 app = FastAPI()
 
-# Retrieve the token from environment variable
-hf_token = os.environ.get("HF_AUTH_TOKEN", None)
-if hf_token is None:
-    print("WARNING: No HF_AUTH_TOKEN found in environment. "
-          "Make sure to set a Hugging Face token if the model is gated.")
-
-
 # -------------------------------------------------------------------------
-# Update this to the Llama 2 Chat model you prefer. This example uses the
-# 7B chat version. For larger models (13B, 70B), ensure you have enough RAM.
+# Since Falcon 7B Instruct is not gated, you do NOT need an HF token.
+# We omit any 'use_auth_token' parameter.
 # -------------------------------------------------------------------------
-model_name = "meta-llama/Llama-2-7b-chat-hf"
+model_name = "tiiuae/falcon-7b-instruct"
 
-# -------------------------------------------------------------------------
-# If the repo is gated, you may need:
-#     use_auth_token="YOUR_HF_TOKEN",
-#     trust_remote_code=True,
-# or you can set environment variables in your HF Space to authenticate.
-# -------------------------------------------------------------------------
-print(f"Loading model/tokenizer from: {model_name}")
+print(f"Loading tokenizer from: {model_name}")
 tokenizer = AutoTokenizer.from_pretrained(
     model_name,
-    trust_remote_code=True,
-    use_auth_token=hf_token,
+    trust_remote_code=True
 )
 
-# -------------------------------------------------------------------------
-# If you had GPU available, you might do:
-# model = AutoModelForCausalLM.from_pretrained(
-#     model_name,
-#     torch_dtype=torch.float16,
-#     device_map="auto",
-#     trust_remote_code=True
-# )
-# But for CPU, we do a simpler load:
-# -------------------------------------------------------------------------
+print(f"Loading model from: {model_name}")
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    trust_remote_code=True,
-    use_auth_token=hf_token,
+    trust_remote_code=True
 )
 
-# Choose device based on availability
+# Choose device based on availability (CPU or GPU)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 model.to(device)
@@ -57,9 +33,9 @@ model.to(device)
 @app.post("/predict")
 async def predict(request: Request):
     """
-    Endpoint for streaming responses from the Llama 2 chat model.
+    Endpoint for streaming responses from Falcon-7B-Instruct.
     Expects JSON: { "prompt": "<Your prompt>" }
-    Returns a text/event-stream of tokens.
+    Returns a text/event-stream of tokens (SSE).
     """
     data = await request.json()
     prompt = data.get("prompt", "")
@@ -72,19 +48,16 @@ async def predict(request: Request):
     attention_mask = inputs.attention_mask  # same shape
 
     def token_generator():
-        """
-        A generator that yields tokens one by one for SSE streaming.
-        """
         nonlocal input_ids, attention_mask
 
         # Basic generation hyperparameters
         temperature = 0.7
         top_p = 0.9
-        max_new_tokens = 30  # Increase for longer outputs
+        max_new_tokens = 30  # Increase if you want longer outputs
 
         for _ in range(max_new_tokens):
             with torch.no_grad():
-                # 1) Forward pass: compute logits for next token
+                # 1) Forward pass: compute logits for the next token
                 outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                 next_token_logits = outputs.logits[:, -1, :]
 
@@ -101,7 +74,7 @@ async def predict(request: Request):
                 filtered_probs = sorted_probs[valid_indices]
                 filtered_indices = sorted_indices[valid_indices]
 
-                # 5) If no tokens are valid under top_p, fallback to greedy
+                # 5) If no tokens remain after filtering, fall back to greedy
                 if len(filtered_probs) == 0:
                     next_token_id = torch.argmax(next_token_probs)
                 else:
@@ -115,18 +88,18 @@ async def predict(request: Request):
                 # shape [1] => [1,1]
                 next_token_id = next_token_id.unsqueeze(-1)
 
-                # 7) Append token to input_ids
+                # 7) Append the new token to input_ids
                 input_ids = torch.cat([input_ids, next_token_id], dim=-1)
 
-                # 8) Update attention_mask for the new token
+                # 8) Update the attention mask
                 new_mask = attention_mask.new_ones((attention_mask.size(0), 1))
                 attention_mask = torch.cat([attention_mask, new_mask], dim=-1)
 
-                # 9) Decode and yield
+                # 9) Decode and yield the generated token
                 token = tokenizer.decode(next_token_id.squeeze(), skip_special_tokens=True)
                 yield token + " "
 
-                # 10) Stop if we encounter EOS
+                # 10) Stop if EOS token is generated (if the model uses one)
                 if tokenizer.eos_token_id is not None:
                     if next_token_id.squeeze().item() == tokenizer.eos_token_id:
                         break
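
For reference, a minimal client sketch (not part of the commit) for exercising the updated /predict endpoint. The base URL is a placeholder, and the exact chunk framing depends on how app.py wraps token_generator() in a streaming response, which is outside this diff:

import requests

# Placeholder: replace with the actual Space or server address.
BASE_URL = "http://localhost:7860"

# The endpoint expects JSON of the form {"prompt": "<Your prompt>"}.
payload = {"prompt": "Explain what Falcon-7B-Instruct is in one sentence."}

# stream=True lets us print tokens as the server yields them.
with requests.post(f"{BASE_URL}/predict", json=payload, stream=True) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        if chunk:
            print(chunk, end="", flush=True)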