xuxw98 committed
Commit ec09e34 · 1 Parent(s): 3e4df34

Upload app.py

Files changed (1):
  1. app.py +64 -3
app.py CHANGED
@@ -2,7 +2,20 @@ import sys
 import time
 import warnings
 from pathlib import Path
+from typing import Optional
 
+import lightning as L
+import torch
+
+# support running without installing as a package
+wd = Path(__file__).parent.parent.resolve()
+sys.path.append(str(wd))
+
+from generate import generate
+from lit_llama import Tokenizer
+from lit_llama.adapter import LLaMA
+from lit_llama.utils import EmptyInitOnDevice, lazy_load, llama_model_lookup
+from scripts.prepare_alpaca import generate_prompt
 
 # Configure the Hugging Face environment
 from huggingface_hub import hf_hub_download
@@ -12,8 +25,34 @@ import glob
 import json
 
 # os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
-# torch.set_float32_matmul_precision("high")
+torch.set_float32_matmul_precision("high")
+
+def model_load(
+    adapter_path: Path = Path("out/adapter/alpaca/lit-llama-adapter-finetuned_15k.pth"),
+    pretrained_path: Path = Path("checkpoints/lit-llama/7B/lit-llama.pth"),
+    quantize: Optional[str] = "llm.int8",
+):
+
+    fabric = L.Fabric(devices=1)
+    dtype = torch.bfloat16 if fabric.device.type == "cuda" and torch.cuda.is_bf16_supported() else torch.float32
+
+    with lazy_load(pretrained_path) as pretrained_checkpoint, lazy_load(adapter_path) as adapter_checkpoint:
+        name = llama_model_lookup(pretrained_checkpoint)
 
+        with EmptyInitOnDevice(
+            device=fabric.device, dtype=dtype, quantization_mode=quantize
+        ):
+            model = LLaMA.from_name(name)
+
+        # 1. Load the pretrained weights
+        model.load_state_dict(pretrained_checkpoint, strict=False)
+        # 2. Load the fine-tuned adapter weights
+        model.load_state_dict(adapter_checkpoint, strict=False)
+
+    model.eval()
+    model = fabric.setup_module(model)
+
+    return model
 
 
 def instruct_generate(
@@ -42,17 +81,39 @@ def instruct_generate(
         top_k: The number of top most probable tokens to consider in the sampling process.
        temperature: A value controlling the randomness of the sampling process. Higher values result in more random
    """
-    output = [prompt, input, max_new_tokens, top_k, temperature]
+    sample = {"instruction": prompt, "input": input}
+    prompt = generate_prompt(sample)
+    encoded = tokenizer.encode(prompt, bos=True, eos=False, device=model.device)
+    # prompt_length = encoded.size(0)
+
+    y = generate(
+        model,
+        idx=encoded,
+        max_seq_length=max_new_tokens,
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+        top_k=top_k,
+        eos_id=tokenizer.eos_id
+    )
+
+    output = tokenizer.decode(y)
+    output = output.split("### Response:")[1].strip()
     print(output)
     return output
 
 # Set the concrete parameters
-
+pretrained_path = hf_hub_download(
+    repo_id="xxw/tapa_model", filename="lit-llama.pth")
+tokenizer_path = hf_hub_download(
+    repo_id="xxw/tapa_model", filename="tokenizer.model")
+adapter_path = "lit-llama-adapter-finetuned_15k.pth"
 example_path = "example.json"
 # If 1024 is not enough, adjust it to 512
 max_seq_len = 1024
 max_batch_size = 1
 
+model = model_load(adapter_path, pretrained_path)
+tokenizer = Tokenizer(tokenizer_path)
 with open(example_path, 'r') as f:
     content = f.read()
     example_dict = json.loads(content)
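
For a quick sanity check, a minimal sketch (not part of this commit) of how the updated instruct_generate might be exercised once the module-level setup above has run and model and tokenizer are in scope; the instruction text and sampling values below are illustrative assumptions, and the UI wiring in the rest of app.py is not shown in this diff:

# Hypothetical smoke test; argument values are illustrative, not from the commit.
answer = instruct_generate(
    prompt="Give three tips for staying healthy.",  # instruction text
    input="",                                       # optional extra context
    max_new_tokens=128,
    top_k=200,
    temperature=0.8,
)
print(answer)  # the function also prints the decoded response before returning it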