fullstuckdev committed
Commit ce875c8 · 1 Parent(s): 93374aa

update script

Files changed (1): app.py (+133, -49)
app.py CHANGED
@@ -93,48 +93,77 @@ async def root():
 async def generate_text(request: GenerateRequest):
     """
     Generate medical text based on input prompt
-
-    Parameters:
-    - text: Input text prompt
-    - max_length: Maximum length of generated text
-    - temperature: Sampling temperature (0.0 to 1.0)
-    - num_return_sequences: Number of sequences to generate
-
-    Returns:
-    - List of generated text sequences
     """
     try:
+        # Check if model is loaded
         if model is None or tokenizer is None:
-            raise HTTPException(status_code=500, detail="Model not loaded")
+            logger.error("Model or tokenizer not initialized")
+            raise HTTPException(
+                status_code=500,
+                detail="Model not loaded. Please check if model was initialized correctly."
+            )

-        inputs = tokenizer(
-            request.text,
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=request.max_length
-        ).to(model.device)
+        logger.info(f"Generating text for input: {request.text[:50]}...")
+
+        # Log device information
+        device_info = f"Using device: {model.device}"
+        logger.info(device_info)

-        with torch.no_grad():
-            generated_ids = model.generate(
-                inputs.input_ids,
-                max_length=request.max_length,
-                num_return_sequences=request.num_return_sequences,
-                temperature=request.temperature,
-                pad_token_id=tokenizer.pad_token_id,
-                eos_token_id=tokenizer.eos_token_id,
+        # Tokenize input
+        try:
+            inputs = tokenizer(
+                request.text,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=request.max_length
             )
+            logger.info("Input tokenized successfully")
+
+            # Move inputs to correct device
+            inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+        except Exception as e:
+            logger.error(f"Tokenization error: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Tokenization failed: {str(e)}")
+
+        # Generate text
+        try:
+            with torch.no_grad():
+                generated_ids = model.generate(
+                    inputs["input_ids"],
+                    max_length=request.max_length,
+                    num_return_sequences=request.num_return_sequences,
+                    temperature=request.temperature,
+                    pad_token_id=tokenizer.pad_token_id,
+                    eos_token_id=tokenizer.eos_token_id,
+                )
+            logger.info("Text generated successfully")
+        except Exception as e:
+            logger.error(f"Generation error: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Text generation failed: {str(e)}")

-        generated_texts = [
-            tokenizer.decode(g, skip_special_tokens=True)
-            for g in generated_ids
-        ]
+        # Decode generated text
+        try:
+            generated_texts = [
+                tokenizer.decode(g, skip_special_tokens=True)
+                for g in generated_ids
+            ]
+            logger.info("Text decoded successfully")
+        except Exception as e:
+            logger.error(f"Decoding error: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Text decoding failed: {str(e)}")

         return GenerateResponse(generated_text=generated_texts)

+    except HTTPException as he:
+        raise he
     except Exception as e:
-        logger.error(f"Generation error: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
+        logger.error(f"Unexpected error: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"An unexpected error occurred: {str(e)}"
+        )

 @app.get("/health", tags=["Health"])
 async def health_check():
@@ -297,25 +326,80 @@ def init_model():
         device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f"Loading model on device: {device}")

-        # Try to load fine-tuned model if it exists
-        if os.path.exists(model_output_path):
-            tokenizer = AutoTokenizer.from_pretrained(model_output_path)
-            model = AutoModelForCausalLM.from_pretrained(
-                model_output_path,
-                torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-                device_map="auto"
-            )
-        else:
-            # Load base model if no fine-tuned model exists
-            model_name = "nvidia/Meta-Llama-3.2-3B-Instruct-ONNX-INT4"
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
-            model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-                device_map="auto"
-            )
+        model_name = "nvidia/Meta-Llama-3.2-3B-Instruct-ONNX-INT4"
+
+        # Load tokenizer
+        logger.info("Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            cache_dir="/app/cache",
+            trust_remote_code=True
+        )
+
+        # Add padding token if not present
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        logger.info("Loading model...")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+            device_map="auto",
+            cache_dir="/app/cache",
+            trust_remote_code=True
+        )

+        logger.info(f"Model loaded successfully on {device}")
         return tokenizer, model
+
     except Exception as e:
         logger.error(f"Model initialization error: {str(e)}")
-        raise
+        raise
+
+@app.get("/model-status", tags=["Health"])
+async def model_status():
+    """
+    Get detailed model status
+    """
+    try:
+        model_info = {
+            "model_loaded": model is not None,
+            "tokenizer_loaded": tokenizer is not None,
+            "model_device": str(model.device) if model else None,
+            "gpu_available": torch.cuda.is_available(),
+            "cuda_device_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
+            "cuda_device_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
+            "model_type": type(model).__name__ if model else None,
+            "tokenizer_type": type(tokenizer).__name__ if tokenizer else None,
+        }
+
+        if model is not None:
+            try:
+                # Test tokenizer
+                test_input = tokenizer("test", return_tensors="pt")
+                model_info["tokenizer_working"] = True
+            except Exception as e:
+                model_info["tokenizer_working"] = False
+                model_info["tokenizer_error"] = str(e)
+
+            try:
+                # Test model forward pass
+                with torch.no_grad():
+                    test_output = model.generate(
+                        test_input.input_ids.to(model.device),
+                        max_length=10
+                    )
+                model_info["model_working"] = True
+            except Exception as e:
+                model_info["model_working"] = False
+                model_info["model_error"] = str(e)
+
+        return model_info
+
+    except Exception as e:
+        logger.error(f"Error checking model status: {str(e)}")
+        return {
+            "error": str(e),
+            "model_loaded": model is not None,
+            "tokenizer_loaded": tokenizer is not None
+        }
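
For quick verification after this change, a minimal client sketch against the updated service. This is a sketch under assumptions, not code from the repo: the base URL (a locally running Space on port 7860) and the POST route /generate are guesses, since the route decorator for generate_text sits outside this diff; the payload fields mirror GenerateRequest as used above, and /model-status is the endpoint added in this commit.

import requests

BASE_URL = "http://localhost:7860"  # assumption: local Space; adjust to your deployment

# Check that the model loaded before generating (endpoint added in this commit)
status = requests.get(f"{BASE_URL}/model-status", timeout=30).json()
print("model_loaded:", status.get("model_loaded"), "model_working:", status.get("model_working"))

# Assumed route name; fields mirror GenerateRequest as used in generate_text above
payload = {
    "text": "List common symptoms of type 2 diabetes.",
    "max_length": 200,
    "temperature": 0.7,
    "num_return_sequences": 1,
}
resp = requests.post(f"{BASE_URL}/generate", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["generated_text"])

One caveat visible in the diff itself: generate() receives max_length=request.max_length, the same limit used to truncate the prompt, so a long prompt can leave no room for new tokens; passing max_new_tokens instead is the usual remedy.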