adamo1139 committed on
Commit
c735b81
·
verified ·
1 Parent(s): 0641089

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +19 -79
README.md CHANGED
@@ -55,90 +55,30 @@ This quant was created using llmcompressor.
55
  Code below.
56
 
57
  ```python
58
- import torch
59
- from datasets import load_dataset
60
  from transformers import AutoTokenizer
 
 
61
 
62
- from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
63
- from llmcompressor.transformers.compression.helpers import (
64
- calculate_offload_device_map,
65
- custom_offload_device_map,
66
- )
67
 
68
- recipe = """
69
- quant_stage:
70
- quant_modifiers:
71
- QuantizationModifier:
72
- ignore: ["lm_head"]
73
- config_groups:
74
- group_0:
75
- weights:
76
- num_bits: 8
77
- type: float
78
- strategy: tensor
79
- dynamic: false
80
- symmetric: true
81
- input_activations:
82
- num_bits: 8
83
- type: float
84
- strategy: tensor
85
- dynamic: false
86
- symmetric: true
87
- targets: ["Linear"]
88
- """
89
-
90
- model_stub = "NousResearch/Hermes-3-Llama-3.1-8B"
91
- model_name = model_stub.split("/")[-1]
92
-
93
- device_map = calculate_offload_device_map(
94
- model_stub, reserve_for_hessians=False, num_gpus=1, torch_dtype="auto"
95
- )
96
 
97
  model = SparseAutoModelForCausalLM.from_pretrained(
98
- model_stub, torch_dtype="auto", device_map=device_map
99
- )
100
- tokenizer = AutoTokenizer.from_pretrained(model_stub)
101
-
102
- output_dir = f"./{model_name}-FP8"
103
-
104
- DATASET_ID = "HuggingFaceH4/ultrachat_200k"
105
- DATASET_SPLIT = "train_sft"
106
- NUM_CALIBRATION_SAMPLES = 512
107
- MAX_SEQUENCE_LENGTH = 4096
108
-
109
- ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
110
- ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
111
-
112
- def preprocess(example):
113
- return {
114
- "text": tokenizer.apply_chat_template(
115
- example["messages"],
116
- tokenize=False,
117
- )
118
- }
119
-
120
- ds = ds.map(preprocess)
121
-
122
- def tokenize(sample):
123
- return tokenizer(
124
- sample["text"],
125
- padding=False,
126
- max_length=MAX_SEQUENCE_LENGTH,
127
- truncation=True,
128
- add_special_tokens=False,
129
- )
130
-
131
- ds = ds.map(tokenize, remove_columns=ds.column_names)
132
-
133
- oneshot(
134
- model=model,
135
- output_dir=output_dir,
136
- dataset=ds,
137
- recipe=recipe,
138
- max_seq_length=MAX_SEQUENCE_LENGTH,
139
- num_calibration_samples=NUM_CALIBRATION_SAMPLES,
140
- save_compressed=True,
141
- )
142
 
143
  ```
144
 
 
55
  Code below.
56
 
57
  ```python
58
+ from llmcompressor.transformers import SparseAutoModelForCausalLM
 
59
  from transformers import AutoTokenizer
60
+ from llmcompressor.transformers import oneshot
61
+ from llmcompressor.modifiers.quantization import QuantizationModifier
62
 
 
 
 
 
 
63
 
64
+ MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  model = SparseAutoModelForCausalLM.from_pretrained(
67
+ MODEL_ID, device_map="auto", torch_dtype="auto")
68
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
69
+
70
+ # Configure the simple PTQ quantization
71
+ recipe = QuantizationModifier(
72
+ targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
73
+
74
+ # Apply the quantization algorithm.
75
+ oneshot(model=model, recipe=recipe)
76
+
77
+ # Save the model.
78
+ SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
79
+ model.save_pretrained(SAVE_DIR)
80
+ tokenizer.save_pretrained(SAVE_DIR)
81
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  ```
84