vilarin committed on
Commit
d8a8bf1
·
verified ·
1 Parent(s): 659ca36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -4
app.py CHANGED
@@ -9,7 +9,7 @@ import os
9
  import time
10
  import spaces
11
  import torch
12
- from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, TextIteratorStreamer
13
  import gradio as gr
14
  from threading import Thread
15
 
@@ -39,14 +39,18 @@ h3 {
39
 
40
  device = "cuda" # for GPU usage or "cpu" for CPU usage
41
 
 
 
 
 
 
 
42
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
43
  model = AutoModelForCausalLM.from_pretrained(
44
  MODEL,
45
  torch_dtype=torch.float16,
46
  device_map="auto",
47
- load_in_8bit=False,
48
- load_in_4bit=True,
49
- use_flash_attention_2=True)
50
 
51
  # Ensure `pad_token_id` is set
52
  if tokenizer.pad_token_id is None:
 
9
  import time
10
  import spaces
11
  import torch
12
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
13
  import gradio as gr
14
  from threading import Thread
15
 
 
39
 
40
  device = "cuda" # for GPU usage or "cpu" for CPU usage
41
 
42
+ quantization_config = BitsAndBytesConfig(
43
+ load_in_4bit=True,
44
+ bnb_4bit_compute_dtype=torch.bfloat16,
45
+ bnb_4bit_use_double_quant=True,
46
+ bnb_4bit_quant_type= "nf4")
47
+
48
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
49
  model = AutoModelForCausalLM.from_pretrained(
50
  MODEL,
51
  torch_dtype=torch.float16,
52
  device_map="auto",
53
+ quantization_config=quantization_config)
 
 
54
 
55
  # Ensure `pad_token_id` is set
56
  if tokenizer.pad_token_id is None: