CamiloVega committed on
Commit
29cdba6
·
verified ·
1 Parent(s): efa868e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -8
app.py CHANGED
@@ -56,12 +56,12 @@ class ModelManager:
56
  logger.info("Starting model initialization...")
57
  model_name = "meta-llama/Llama-2-7b-chat-hf"
58
 
59
- # Configure 4-bit quantization
60
  bnb_config = BitsAndBytesConfig(
61
- load_in_4bit=True,
62
- bnb_4bit_use_double_quant=True,
63
- bnb_4bit_quant_type="nf4",
64
- bnb_4bit_compute_dtype=torch.bfloat16
65
  )
66
 
67
  # Load tokenizer with optimized settings
@@ -82,9 +82,6 @@ class ModelManager:
82
  device_map="auto",
83
  torch_dtype=torch.float16,
84
  quantization_config=bnb_config,
85
- use_flash_attention_2=True,
86
- use_cache=True,
87
- attn_implementation="flash_attention_2",
88
  low_cpu_mem_usage=True,
89
  )
90
 
 
56
  logger.info("Starting model initialization...")
57
  model_name = "meta-llama/Llama-2-7b-chat-hf"
58
 
59
+ # Configure 8-bit quantization instead of 4-bit
60
  bnb_config = BitsAndBytesConfig(
61
+ load_in_8bit=True,
62
+ llm_int8_threshold=6.0,
63
+ llm_int8_has_fp16_weight=False,
64
+ llm_int8_enable_fp32_cpu_offload=False
65
  )
66
 
67
  # Load tokenizer with optimized settings
 
82
  device_map="auto",
83
  torch_dtype=torch.float16,
84
  quantization_config=bnb_config,
 
 
 
85
  low_cpu_mem_usage=True,
86
  )
87