Upload 10 files
- LICENSE.txt +201 -0
- README.md +145 -3
- convert_llama_weights_to_hf.py +34 -0
- gptq.py +236 -0
- llama.py +515 -0
- llama_inference.py +123 -0
- llama_inference_offload.py +279 -0
- neox.py +430 -0
- opt.py +446 -0
- requirements.txt +11 -0
LICENSE.txt
ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.

"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:

(a) You must give any other recipients of the Work or Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.

You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
README.md
CHANGED
@@ -1,3 +1,145 @@
# GPTQ-for-LLaMA
<img src="https://user-images.githubusercontent.com/64115820/235287009-2d07bba8-9b85-4973-9e06-2a3c28777f06.png" width="50%" height="50%">

4-bit quantization of [LLaMA](https://arxiv.org/abs/2302.13971) using [GPTQ](https://arxiv.org/abs/2210.17323).

GPTQ is a SOTA one-shot weight quantization method.

**It can be used universally, but it is not the [fastest](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/old-cuda) option and it only supports Linux.**

**Triton only supports Linux, so if you are a Windows user, please use [WSL2](https://learn.microsoft.com/en-us/windows/wsl/install).**

## News or Updates
**AutoGPTQ-triton, a packaged version of GPTQ with Triton, has been integrated into [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ).**

## Results
In the tables below, RTN is the round-to-nearest baseline and the Wikitext2 column reports perplexity (lower is better).

<details>
<summary>LLaMA-7B (click me)</summary>

| [LLaMA-7B](https://arxiv.org/abs/2302.13971) | Bits | group-size | memory (MiB) | Wikitext2 PPL | checkpoint size (GB) |
| --- | --- | --- | --- | --- | --- |
| FP16 | 16 | - | 13940 | 5.68 | 12.5 |
| RTN | 4 | - | - | 6.29 | - |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | - | 4740 | 6.09 | 3.5 |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | 128 | 4891 | 5.85 | 3.6 |
| RTN | 3 | - | - | 25.54 | - |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | - | 3852 | 8.07 | 2.7 |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | 128 | 4116 | 6.61 | 3.0 |

</details>

<details>
<summary>LLaMA-13B</summary>

| [LLaMA-13B](https://arxiv.org/abs/2302.13971) | Bits | group-size | memory (MiB) | Wikitext2 PPL | checkpoint size (GB) |
| --- | --- | --- | --- | --- | --- |
| FP16 | 16 | - | OOM | 5.09 | 24.2 |
| RTN | 4 | - | - | 5.53 | - |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | - | 8410 | 5.36 | 6.5 |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | 128 | 8747 | 5.20 | 6.7 |
| RTN | 3 | - | - | 11.40 | - |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | - | 6870 | 6.63 | 5.1 |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | 128 | 7277 | 5.62 | 5.4 |

</details>

<details>
<summary>LLaMA-33B</summary>

| [LLaMA-33B](https://arxiv.org/abs/2302.13971) | Bits | group-size | memory (MiB) | Wikitext2 PPL | checkpoint size (GB) |
| --- | --- | --- | --- | --- | --- |
| FP16 | 16 | - | OOM | 4.10 | 60.5 |
| RTN | 4 | - | - | 4.54 | - |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | - | 19493 | 4.45 | 15.7 |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | 128 | 20570 | 4.23 | 16.3 |
| RTN | 3 | - | - | 14.89 | - |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | - | 15493 | 5.69 | 12.0 |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | 128 | 16566 | 4.80 | 13.0 |

</details>

<details>
<summary>LLaMA-65B</summary>

| [LLaMA-65B](https://arxiv.org/abs/2302.13971) | Bits | group-size | memory (MiB) | Wikitext2 PPL | checkpoint size (GB) |
| --- | --- | --- | --- | --- | --- |
| FP16 | 16 | - | OOM | 3.53 | 121.0 |
| RTN | 4 | - | - | 3.92 | - |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | - | OOM | 3.84 | 31.1 |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | 128 | OOM | 3.65 | 32.3 |
| RTN | 3 | - | - | 10.59 | - |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | - | OOM | 5.04 | 23.6 |
| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | 128 | OOM | 4.17 | 25.6 |

</details>

Quantization requires a large amount of CPU memory; the requirement can, however, be reduced by using swap memory.

Depending on the GPUs/drivers there may be a difference in performance, and the difference decreases as the model size increases ([IST-DASLab/gptq#1](https://github.com/IST-DASLab/gptq/issues/1)).

According to the [GPTQ paper](https://arxiv.org/abs/2210.17323), the gap between FP16 and GPTQ also shrinks as the model size grows.
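For intuition, GPTQ quantizes the model one layer at a time: given calibration inputs $X$ for a layer with weights $W$, it chooses quantized weights $\hat{W}$ that approximately minimize the layer-output error

$$\min_{\hat{W}} \lVert WX - \hat{W}X \rVert_2^2 ,$$

using second-order information from the Hessian $H = 2XX^\top$ (the quantity accumulated in `gptq.py` below) to order the columns and compensate each rounding error. This is a rough sketch of the method from the GPTQ paper, not an exact statement of the implementation.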
## Installation
If you don't have [conda](https://docs.conda.io/en/latest/miniconda.html), install it first.
```
conda create --name gptq python=3.9 -y
conda activate gptq
conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
# Or, if you're having trouble with conda, use pip with python3.9:
# pip3 install torch torchvision torchaudio

git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa
cd GPTQ-for-LLaMa
pip install -r requirements.txt
```
## Dependencies

* `torch`: tested on v2.0.0+cu117
* `transformers`: tested on v4.28.0.dev0
* `datasets`: tested on v2.10.1
* `safetensors`: tested on v0.3.0

All experiments were run on a single NVIDIA RTX 3090.

# Language Generation
## LLaMA

```
# Convert LLaMA weights to the HF format
python convert_llama_weights_to_hf.py --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir ./llama-hf

# Benchmark language generation with 4-bit LLaMA-7B:

# Save the compressed model
CUDA_VISIBLE_DEVICES=0 python llama.py ${MODEL_DIR} c4 --wbits 4 --true-sequential --act-order --groupsize 128 --save llama7b-4bit-128g.pt

# Or save a compressed `.safetensors` model
CUDA_VISIBLE_DEVICES=0 python llama.py ${MODEL_DIR} c4 --wbits 4 --true-sequential --act-order --groupsize 128 --save_safetensors llama7b-4bit-128g.safetensors

# Benchmark generating a 2048-token sequence with the saved model
CUDA_VISIBLE_DEVICES=0 python llama.py ${MODEL_DIR} c4 --wbits 4 --groupsize 128 --load llama7b-4bit-128g.pt --benchmark 2048 --check

# Benchmark the FP16 baseline; note that the model will be split across all listed GPUs
CUDA_VISIBLE_DEVICES=0,1,2,3,4 python llama.py ${MODEL_DIR} c4 --benchmark 2048 --check

# Model inference with the saved model
CUDA_VISIBLE_DEVICES=0 python llama_inference.py ${MODEL_DIR} --wbits 4 --groupsize 128 --load llama7b-4bit-128g.pt --text "this is llama"

# Model inference with the saved model, using safetensors loaded directly to the GPU
CUDA_VISIBLE_DEVICES=0 python llama_inference.py ${MODEL_DIR} --wbits 4 --groupsize 128 --load llama7b-4bit-128g.safetensors --text "this is llama" --device=0

# Model inference with the saved model with CPU offload (this is very slow)
CUDA_VISIBLE_DEVICES=0 python llama_inference_offload.py ${MODEL_DIR} --wbits 4 --groupsize 128 --load llama7b-4bit-128g.pt --text "this is llama" --pre_layer 16
# With LLaMA-65B and --pre_layer 50, it takes about 180 seconds to generate 45 tokens (5 -> 50 tokens) on a single RTX 3090.
```
In general, 4-bit quantization with groupsize 128 is recommended.

You can also export the quantization parameters in toml+numpy format.
```
CUDA_VISIBLE_DEVICES=0 python llama.py ${MODEL_DIR} c4 --wbits 4 --true-sequential --act-order --groupsize 128 --quant-directory ${TOML_DIR}
```
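If you prefer to drive inference from Python rather than from the command line, the following is a minimal sketch of what `llama_inference.py` does internally; the paths and the prompt are placeholders, not files shipped with the repo.
```
import torch
from transformers import AutoTokenizer
from llama_inference import load_quant

MODEL_DIR = "./llama-hf/llama-7b"              # converted HF model (placeholder path)
CHECKPOINT = "llama7b-4bit-128g.safetensors"   # packed GPTQ checkpoint (placeholder path)

# Rebuild the LLaMA skeleton with QuantLinear modules and load the packed weights.
model = load_quant(MODEL_DIR, CHECKPOINT, 4, 128)
model.to("cuda:0")

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
input_ids = tokenizer.encode("this is llama", return_tensors="pt").to("cuda:0")

with torch.no_grad():
    generated = model.generate(input_ids, do_sample=True, min_length=10,
                               max_length=50, top_p=0.95, temperature=0.8)
print(tokenizer.decode(generated[0]))
```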
# Acknowledgements
This code is based on [GPTQ](https://github.com/IST-DASLab/gptq).

Thanks to Meta AI for releasing [LLaMA](https://arxiv.org/abs/2302.13971), a powerful LLM.

The Triton GPTQ kernel code is based on [GPTQ-triton](https://github.com/fpgaminer/GPTQ-triton).
convert_llama_weights_to_hf.py
ADDED
@@ -0,0 +1,34 @@
import argparse
import os
from transformers.models.llama.convert_llama_weights_to_hf import write_model, write_tokenizer


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_dir",
        help="Location of LLaMA weights, which contains tokenizer.model and model folders",
    )
    parser.add_argument(
        "--model_size",
        choices=["7B", "13B", "30B", "65B", "tokenizer_only"],
    )
    parser.add_argument(
        "--output_dir",
        help="Location to write HF model and tokenizer",
    )
    args = parser.parse_args()
    if args.model_size != "tokenizer_only":
        write_model(
            model_path=os.path.join(args.output_dir, "llama-{}".format(args.model_size).lower()),
            input_base_path=os.path.join(args.input_dir, args.model_size),
            model_size=args.model_size,
        )
    write_tokenizer(
        tokenizer_path=os.path.join(args.output_dir, "llama-{}".format(args.model_size).lower()),
        input_tokenizer_path=os.path.join(args.input_dir, "tokenizer.model"),
    )


if __name__ == "__main__":
    main()
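As the script above shows, the converted model is written to `output_dir/llama-<model_size>` (lower-cased) and the tokenizer is read from `input_dir/tokenizer.model`; with the README's example invocation, the 7B shards are read from `/path/to/downloaded/llama/weights/7B` and the HF checkpoint ends up in `./llama-hf/llama-7b`.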
gptq.py
ADDED
@@ -0,0 +1,236 @@
import math
import time

import torch
import torch.nn as nn
import transformers
import quant
from texttable import Texttable
from utils import torch_snr_error

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False


class Observer:

    def __init__(self, topk=32):
        self.loss_list = []
        self.topk = topk

    def submit(self, name: str, layerid: int, gptq, error: float):

        item = (name, layerid, {'gptq': gptq, 'error': error})

        if len(self.loss_list) < self.topk:
            self.loss_list.append(item)
            return

        min_error = error
        min_idx = -1
        for idx, data in enumerate(self.loss_list):
            if min_error > data[2]['error']:
                min_idx = idx
                min_error = data[2]['error']

        if min_idx >= 0:
            self.loss_list[min_idx] = item

    def print(self):
        self.loss_list = sorted(self.loss_list, key=lambda s: s[2]['error'], reverse=True)

        table = Texttable()

        table.header(['name', 'error'])
        table.set_cols_dtype(['t', 'f'])

        for item in self.loss_list:
            table.add_row([f"{item[0]}.{item[1]}", item[2]['error']])
        print(table.draw())
        print('\n')

    def items(self):
        return self.loss_list


class GPTQ:

    def __init__(self, layer, observe=False):
        self.layer = layer
        self.dev = self.layer.weight.device
        W = layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(self.layer, transformers.Conv1D):
            W = W.t()
        self.rows = W.shape[0]
        self.columns = W.shape[1]
        self.H = torch.zeros((self.columns, self.columns), device=self.dev)
        self.nsamples = 0
        self.quantizer = quant.Quantizer()
        self.observe = observe

    def add_batch(self, inp, out):
        # Hessian H = 2 X XT + λ I
        if self.observe:
            self.inp1 = inp
            self.out1 = out
        else:
            self.inp1 = None
            self.out1 = None

        if len(inp.shape) == 2:
            inp = inp.unsqueeze(0)
        tmp = inp.shape[0]
        if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D):
            if len(inp.shape) == 3:
                inp = inp.reshape((-1, inp.shape[-1]))
            inp = inp.t()
        if isinstance(self.layer, nn.Conv2d):
            unfold = nn.Unfold(self.layer.kernel_size, dilation=self.layer.dilation, padding=self.layer.padding, stride=self.layer.stride)
            inp = unfold(inp)
            inp = inp.permute([1, 0, 2])
            inp = inp.flatten(1)
        self.H *= self.nsamples / (self.nsamples + tmp)
        self.nsamples += tmp
        # inp = inp.float()
        inp = math.sqrt(2 / self.nsamples) * inp.float()
        # self.H += 2 / self.nsamples * inp.matmul(inp.t())
        self.H += inp.matmul(inp.t())

    def print_loss(self, name, q_weight, weight_error, timecost):
        table = Texttable()
        name += ' ' * (16 - len(name))

        table.header(['name', 'weight_error', 'fp_inp_SNR', 'q_inp_SNR', 'time'])

        # assign weight
        self.layer.weight.data = q_weight.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype)

        if self.inp1 is not None:
            # quantize input to int8
            quantizer = quant.Quantizer()
            quantizer.configure(8, perchannel=False, sym=True, mse=False)
            quantizer.find_params(self.inp1)
            q_in = quantizer.quantize(self.inp1).type(torch.float16)
            q_out = self.layer(q_in)

            # get kinds of SNR
            q_SNR = torch_snr_error(q_out, self.out1).item()
            fp_SNR = torch_snr_error(self.layer(self.inp1), self.out1).item()
        else:
            q_SNR = '-'
            fp_SNR = '-'

        table.add_row([name, weight_error, fp_SNR, q_SNR, timecost])
        print(table.draw().split('\n')[-2])

    def fasterquant(self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False, name=''):
        self.layer.to(self.dev)

        W = self.layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(self.layer, transformers.Conv1D):
            W = W.t()
        W = W.float()

        tick = time.time()

        if not self.quantizer.ready():
            self.quantizer.find_params(W, weight=True)

        H = self.H
        if not self.observe:
            del self.H
        dead = torch.diag(H) == 0
        H[dead, dead] = 1
        W[:, dead] = 0

        if actorder:
            perm = torch.argsort(torch.diag(H), descending=True)
            W = W[:, perm]
            H = H[perm][:, perm]

        Losses = torch.zeros_like(W)
        Q = torch.zeros_like(W)

        damp = percdamp * torch.mean(torch.diag(H))
        diag = torch.arange(self.columns, device=self.dev)
        H[diag, diag] += damp
        H = torch.linalg.cholesky(H)
        H = torch.cholesky_inverse(H)
        H = torch.linalg.cholesky(H, upper=True)
        Hinv = H

        g_idx = []
        scale = []
        zero = []
        now_idx = 1

        for i1 in range(0, self.columns, blocksize):
            i2 = min(i1 + blocksize, self.columns)
            count = i2 - i1

            W1 = W[:, i1:i2].clone()
            Q1 = torch.zeros_like(W1)
            Err1 = torch.zeros_like(W1)
            Losses1 = torch.zeros_like(W1)
            Hinv1 = Hinv[i1:i2, i1:i2]

            for i in range(count):
                w = W1[:, i]
                d = Hinv1[i, i]

                if groupsize != -1:
                    if (i1 + i) % groupsize == 0:
                        self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True)

                    if ((i1 + i) // groupsize) - now_idx == -1:
                        scale.append(self.quantizer.scale)
                        zero.append(self.quantizer.zero)
                        now_idx += 1

                q = self.quantizer.quantize(w.unsqueeze(1)).flatten()
                Q1[:, i] = q
                Losses1[:, i] = (w - q)**2 / d**2

                err1 = (w - q) / d
                W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
                Err1[:, i] = err1

            Q[:, i1:i2] = Q1
            Losses[:, i1:i2] = Losses1 / 2

            W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])

        torch.cuda.synchronize()
        error = torch.sum(Losses).item()

        groupsize = groupsize if groupsize != -1 else self.columns
        g_idx = [i // groupsize for i in range(self.columns)]
        g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device)
        if actorder:
            invperm = torch.argsort(perm)
            Q = Q[:, invperm]
            g_idx = g_idx[invperm]

        if isinstance(self.layer, transformers.Conv1D):
            Q = Q.t()

        self.print_loss(name=name, q_weight=Q, weight_error=error, timecost=(time.time() - tick))

        if scale == []:
            scale.append(self.quantizer.scale)
            zero.append(self.quantizer.zero)
        scale = torch.cat(scale, dim=1)
        zero = torch.cat(zero, dim=1)
        return scale, zero, g_idx, error

    def free(self):
        self.inp1 = None
        self.out1 = None
        self.H = None
        self.Losses = None
        self.Trace = None
        torch.cuda.empty_cache()
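For orientation, this is roughly how `llama.py` drives the class above for a single linear layer; `layer` and the captured `(input, output)` calibration pairs are assumed to already exist and are not part of this repository.
```
import torch
from gptq import GPTQ

gptq = GPTQ(layer)                                   # layer: an nn.Linear to quantize (assumed)
gptq.quantizer.configure(4, perchannel=True, sym=False, mse=False)

# Accumulate the Hessian H ~ (2/n) * sum(x x^T) from calibration activations.
for inp, out in calib_pairs:                         # pairs captured via forward hooks (assumed)
    gptq.add_batch(inp, out)

# Quantize in place; returns per-group scales/zeros, the group index map, and the error.
scale, zero, g_idx, error = gptq.fasterquant(
    blocksize=128, percdamp=0.01, groupsize=128, actorder=True, name='mlp.down_proj')
gptq.free()
```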
llama.py
ADDED
@@ -0,0 +1,515 @@
import argparse
import time
import numpy as np
import torch
import torch.nn as nn
import quant

from gptq import GPTQ, Observer
from utils import find_layers, DEV, set_seed, get_wikitext2, get_ptb, get_c4, get_ptb_new, get_c4_new, get_loaders, export_quant_table, gen_conditions
from texttable import Texttable


def get_llama(model):

    def skip(*args, **kwargs):
        pass

    torch.nn.init.kaiming_uniform_ = skip
    torch.nn.init.uniform_ = skip
    torch.nn.init.normal_ = skip
    from transformers import LlamaForCausalLM
    model = LlamaForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
    model.seqlen = 2048
    return model


@torch.no_grad()
def llama_sequential(model, dataloader, dev):
    print('Starting ...')

    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.layers

    model.model.embed_tokens = model.model.embed_tokens.to(dev)
    model.model.norm = model.model.norm.to(dev)
    layers[0] = layers[0].to(dev)

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros((args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev)
    cache = {'i': 0, 'attention_mask': None}

    class Catcher(nn.Module):

        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache['i']] = inp
            cache['i'] += 1
            cache['attention_mask'] = kwargs['attention_mask']
            cache['position_ids'] = kwargs['position_ids']
            raise ValueError

    layers[0] = Catcher(layers[0])
    for batch in dataloader:
        try:
            model(batch[0].to(dev))
        except ValueError:
            pass
    layers[0] = layers[0].module

    layers[0] = layers[0].cpu()
    model.model.embed_tokens = model.model.embed_tokens.cpu()
    model.model.norm = model.model.norm.cpu()
    torch.cuda.empty_cache()

    outs = torch.zeros_like(inps)
    attention_mask = cache['attention_mask']
    position_ids = cache['position_ids']

    print('Ready.')

    quantizers = {}
    observer = Observer()
    for i in range(len(layers)):

        print(f'Quantizing layer {i+1}/{len(layers)}..')
        print('+------------------+--------------+------------+-----------+-------+')
        print('|       name       | weight_error | fp_inp_SNR | q_inp_SNR |  time |')
        print('+==================+==============+============+===========+=======+')

        layer = layers[i].to(dev)
        full = find_layers(layer)
        if args.true_sequential:
            sequential = [['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], ['self_attn.o_proj'], ['mlp.up_proj', 'mlp.gate_proj'], ['mlp.down_proj']]
        else:
            sequential = [list(full.keys())]

        for names in sequential:
            subset = {n: full[n] for n in names}
            gptq = {}
            for name in subset:
                gptq[name] = GPTQ(subset[name], observe=args.observe)
                gptq[name].quantizer.configure(args.wbits, perchannel=True, sym=args.sym, mse=False)

            def add_batch(name):

                def tmp(_, inp, out):
                    gptq[name].add_batch(inp[0].data, out.data)

                return tmp

            handles = []
            for name in subset:
                handles.append(subset[name].register_forward_hook(add_batch(name)))
            for j in range(args.nsamples):
                outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
            for h in handles:
                h.remove()

            for name in subset:
                scale, zero, g_idx, error = gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order, name=name)
                quantizers['model.layers.%d.%s' % (i, name)] = (gptq[name].quantizer.cpu(), scale.cpu(), zero.cpu(), g_idx.cpu(), args.wbits, args.groupsize)

                if args.observe:
                    observer.submit(name=name, layerid=i, gptq=gptq[name], error=error)
                else:
                    gptq[name].free()

        for j in range(args.nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]

        layers[i] = layer.cpu()
        del layer
        del gptq
        torch.cuda.empty_cache()

        inps, outs = outs, inps
        print('+------------------+--------------+------------+-----------+-------+')
        print('\n')

    if args.observe:
        observer.print()
        conditions = gen_conditions(args.wbits, args.groupsize)
        for item in observer.items():
            name = item[0]
            layerid = item[1]
            gptq = item[2]['gptq']
            error = item[2]['error']
            target = error / 2

            table = Texttable()
            table.header(['wbits', 'groupsize', 'error'])
            table.set_cols_dtype(['i', 'i', 'f'])
            table.add_row([args.wbits, args.groupsize, error])

            print('Optimizing {} {} ..'.format(name, layerid))
            for wbits, groupsize in conditions:

                if error < target:
                    # if the error dropped by 50%, skip
                    break

                gptq.quantizer.configure(wbits, perchannel=True, sym=args.sym, mse=False)

                scale, zero, g_idx, error = gptq.fasterquant(percdamp=args.percdamp, groupsize=groupsize, actorder=args.act_order, name=name)

                table.add_row([wbits, groupsize, error])
                quantizers['model.layers.%d.%s' % (layerid, name)] = (gptq.quantizer.cpu(), scale.cpu(), zero.cpu(), g_idx.cpu(), wbits, groupsize)

            print(table.draw())
            print('\n')
            gptq.layer.to('cpu')
            gptq.free()

    model.config.use_cache = use_cache

    return quantizers


@torch.no_grad()
def llama_eval(model, testenc, dev):
    print('Evaluating ...')

    testenc = testenc.input_ids
    nsamples = testenc.numel() // model.seqlen

    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.layers

    model.model.embed_tokens = model.model.embed_tokens.to(dev)
    layers[0] = layers[0].to(dev)

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros((nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev)
    cache = {'i': 0, 'attention_mask': None}

    class Catcher(nn.Module):

        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache['i']] = inp
            cache['i'] += 1
            cache['attention_mask'] = kwargs['attention_mask']
            cache['position_ids'] = kwargs['position_ids']
            raise ValueError

    layers[0] = Catcher(layers[0])
    for i in range(nsamples):
        batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
        try:
            model(batch)
        except ValueError:
            pass
    layers[0] = layers[0].module

    layers[0] = layers[0].cpu()
    model.model.embed_tokens = model.model.embed_tokens.cpu()
    torch.cuda.empty_cache()

    outs = torch.zeros_like(inps)
    attention_mask = cache['attention_mask']
    position_ids = cache['position_ids']

    for i in range(len(layers)):
        print(i)
        layer = layers[i].to(dev)

        if args.nearest:
            subset = find_layers(layer)
            for name in subset:
                quantizer = quant.Quantizer()
                quantizer.configure(args.wbits, perchannel=True, sym=args.sym, mse=False)
                W = subset[name].weight.data
                quantizer.find_params(W, weight=True)
                subset[name].weight.data = quantizer.quantize(W).to(next(iter(layer.parameters())).dtype)

        for j in range(nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
        layers[i] = layer.cpu()
        del layer
        torch.cuda.empty_cache()
        inps, outs = outs, inps

    if model.model.norm is not None:
        model.model.norm = model.model.norm.to(dev)
    model.lm_head = model.lm_head.to(dev)

    testenc = testenc.to(dev)
    nlls = []
    for i in range(nsamples):
        hidden_states = inps[i].unsqueeze(0)
        if model.model.norm is not None:
            hidden_states = model.model.norm(hidden_states)
        lm_logits = model.lm_head(hidden_states)
        shift_logits = lm_logits[:, :-1, :].contiguous()
        shift_labels = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)][:, 1:]
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        neg_log_likelihood = loss.float() * model.seqlen
        nlls.append(neg_log_likelihood)
    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
    print(ppl.item())

    model.config.use_cache = use_cache


# TODO: perform packing on GPU
def llama_pack(model, quantizers, wbits, groupsize):
    layers = find_layers(model)
    layers = {n: layers[n] for n in quantizers}
    quant.make_quant_linear(model, quantizers, wbits, groupsize)
    qlayers = find_layers(model, [quant.QuantLinear])
    print('Packing ...')
    for name in qlayers:
        print(name)
        quantizers[name], scale, zero, g_idx, _, _ = quantizers[name]
        qlayers[name].pack(layers[name], scale, zero, g_idx)
    print('Done.')
    return model


def load_quant(model, checkpoint, wbits, groupsize=-1, fused_mlp=True, eval=True, warmup_autotune=True):
    from transformers import LlamaConfig, LlamaForCausalLM, modeling_utils
    config = LlamaConfig.from_pretrained(model)

    def noop(*args, **kwargs):
        pass

    torch.nn.init.kaiming_uniform_ = noop
    torch.nn.init.uniform_ = noop
    torch.nn.init.normal_ = noop

    torch.set_default_dtype(torch.half)
    modeling_utils._init_weights = False
    torch.set_default_dtype(torch.half)
    model = LlamaForCausalLM(config)
    torch.set_default_dtype(torch.float)
    if eval:
        model = model.eval()
    layers = find_layers(model)
    for name in ['lm_head']:
        if name in layers:
            del layers[name]
    quant.make_quant_linear(model, layers, wbits, groupsize)

    del layers

    print('Loading model ...')
    if checkpoint.endswith('.safetensors'):
        from safetensors.torch import load_file as safe_load
        model.load_state_dict(safe_load(checkpoint))
    else:
        model.load_state_dict(torch.load(checkpoint))

    if eval:
        quant.make_quant_attn(model)
        quant.make_quant_norm(model)
        if fused_mlp:
            quant.make_fused_mlp(model)

    if warmup_autotune:
        quant.autotune_warmup_linear(model, transpose=not (eval))
        if eval and fused_mlp:
            quant.autotune_warmup_fused(model)
    model.seqlen = 2048
    print('Done.')

    return model


def llama_multigpu(model, gpus, gpu_dist):
    model.model.embed_tokens = model.model.embed_tokens.to(gpus[0])
    if hasattr(model.model, 'norm') and model.model.norm:
        model.model.norm = model.model.norm.to(gpus[-1])
    import copy
    model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])

    cache = {'mask': None}

    class MoveModule(nn.Module):

        def __init__(self, module):
            super().__init__()
            self.module = module
            self.dev = next(iter(self.module.parameters())).device

        def forward(self, *inp, **kwargs):
            inp = list(inp)
            if inp[0].device != self.dev:
                inp[0] = inp[0].to(self.dev)
            if cache['mask'] is None or cache['mask'].device != self.dev:
                cache['mask'] = kwargs['attention_mask'].to(self.dev)
            kwargs['attention_mask'] = cache['mask']
            tmp = self.module(*inp, **kwargs)
            return tmp

    layers = model.model.layers
    from math import ceil
    if not gpu_dist:
        pergpu = ceil(len(layers) / len(gpus))
        for i in range(len(layers)):
            layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
    else:
        assigned_gpus = []
        for i in range(len(gpu_dist)):
            assigned_gpus = assigned_gpus + [i] * gpu_dist[i]

        remaining_assignments = len(layers) - len(assigned_gpus)
        if remaining_assignments > 0:
            assigned_gpus = assigned_gpus + [-1] * remaining_assignments

        for i in range(len(layers)):
            layers[i] = MoveModule(layers[i].to(gpus[assigned_gpus[i]]))

    model.gpus = gpus


def benchmark(model, input_ids, check=False):
    input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
    torch.cuda.synchronize()

    cache = {'past': None}

    def clear_past(i):

        def tmp(layer, inp, out):
            if cache['past']:
                cache['past'][i] = None

        return tmp

    for i, layer in enumerate(model.model.layers):
        layer.register_forward_hook(clear_past(i))

    print('Benchmarking ...')

    if check:
        loss = nn.CrossEntropyLoss()
        tot = 0.

    def sync():
        if hasattr(model, 'gpus'):
            for gpu in model.gpus:
                torch.cuda.synchronize(gpu)
        else:
            torch.cuda.synchronize()

    max_memory = 0
    with torch.no_grad():
        attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
        times = []
        for i in range(input_ids.numel()):
            tick = time.time()
            out = model(input_ids[:, i:i + 1], past_key_values=cache['past'], attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)))
            sync()
            times.append(time.time() - tick)
            print(i, times[-1])
            if hasattr(model, 'gpus'):
                mem_allocated = sum(torch.cuda.memory_allocated(gpu) for gpu in model.gpus) / 1024 / 1024
            else:
                mem_allocated = torch.cuda.memory_allocated() / 1024 / 1024
            max_memory = max(max_memory, mem_allocated)
            if check and i != input_ids.numel() - 1:
                tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
            cache['past'] = list(out.past_key_values)
            del out
        sync()
        print('Median:', np.median(times))
        if check:
            print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
        print('max memory(MiB):', max_memory)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('model', type=str, help='llama model to load')
    parser.add_argument('dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], help='Where to extract calibration data from.')
    parser.add_argument('--seed', type=int, default=0, help='Seed for sampling the calibration data.')
    parser.add_argument('--nsamples', type=int, default=128, help='Number of calibration data samples.')
    parser.add_argument('--percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.')
    parser.add_argument('--nearest', action='store_true', help='Whether to run the RTN baseline.')
    parser.add_argument('--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], help='#bits to use for quantization; use 16 for evaluating base model.')
    parser.add_argument('--trits', action='store_true', help='Whether to use trits for quantization.')
    parser.add_argument('--groupsize', type=int, default=-1, help='Groupsize to use for quantization; default uses full row.')
    parser.add_argument('--eval', action='store_true', help='Evaluate the quantized model.')
    parser.add_argument('--save', type=str, default='', help='Save quantized checkpoint under this name.')
    parser.add_argument('--save_safetensors', type=str, default='', help='Save quantized `.safetensors` checkpoint under this name.')
    parser.add_argument('--load', type=str, default='', help='Load quantized model.')
    parser.add_argument('--benchmark', type=int, default=0, help='Number of tokens to use for benchmarking.')
    parser.add_argument('--check', action='store_true', help='Whether to compute perplexity during benchmarking for verification.')
    parser.add_argument('--sym', action='store_true', help='Whether to perform symmetric quantization.')
    parser.add_argument('--act-order', action='store_true', help='Whether to apply the activation order GPTQ heuristic.')
    parser.add_argument('--true-sequential', action='store_true', help='Whether to run in true sequential mode.')
    parser.add_argument('--new-eval', action='store_true', help='Whether to use the new PTB and C4 eval.')
    parser.add_argument('--layers-dist', type=str, default='', help='Distribution of layers across GPUs. e.g. 2:1:1 for 2 layers on GPU 0, 1 layer on GPU 1, and 1 layer on GPU 2. Any remaining layers will be assigned to your last GPU.')
    parser.add_argument('--observe',
                        action='store_true',
                        help='Auto upgrade layer precision to a higher precision, for example int2 to int4, groupsize 128 to 64. '
                        'When this feature is enabled, `--save` and `--save_safetensors` are disabled.')
    parser.add_argument('--quant-directory', type=str, default=None, help='Specify the directory for exporting quantization parameters in toml format. `None` means no export by default.')

    args = parser.parse_args()

    if args.layers_dist:
        gpu_dist = [int(x) for x in args.layers_dist.split(':')]
    else:
        gpu_dist = []

    if type(args.load) is not str:
        args.load = args.load.as_posix()

    if args.load:
        model = load_quant(args.model, args.load, args.wbits, args.groupsize)
    else:
        model = get_llama(args.model)
        model.eval()

    dataloader, testloader = get_loaders(args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen)

    if not args.load and args.wbits < 16 and not args.nearest:
        tick = time.time()
        quantizers = llama_sequential(model, dataloader, DEV)
        print(time.time() - tick)

    if args.benchmark:
        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
        if len(gpus) > 1:
            llama_multigpu(model, gpus, gpu_dist)
        else:
            model = model.to(DEV)
        if args.benchmark:
            input_ids = next(iter(dataloader))[0][:, :args.benchmark]
            benchmark(model, input_ids, check=args.check)

    if args.eval:
        datasets = ['wikitext2', 'ptb', 'c4']
        if args.new_eval:
            datasets = ['wikitext2', 'ptb-new', 'c4-new']
        for dataset in datasets:
            dataloader, testloader = get_loaders(dataset, seed=args.seed, model=args.model, seqlen=model.seqlen)
            print(dataset)
            llama_eval(model, testloader, DEV)

    if args.quant_directory is not None:
        export_quant_table(quantizers, args.quant_directory)

    if not args.observe and args.save:
        llama_pack(model, quantizers, args.wbits, args.groupsize)
        torch.save(model.state_dict(), args.save)

    if not args.observe and args.save_safetensors:
        llama_pack(model, quantizers, args.wbits, args.groupsize)
        from safetensors.torch import save_file as safe_save
        state_dict = model.state_dict()
        state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()}
        safe_save(state_dict, args.save_safetensors)
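One detail worth noting from the file above: `llama_sequential` returns a dictionary keyed by module path (for example `model.layers.0.self_attn.q_proj`), and each value is the `(quantizer, scale, zero, g_idx, wbits, groupsize)` tuple that `llama_pack` later unpacks when packing the `QuantLinear` modules. That is why saving a packed checkpoint with `--save` or `--save_safetensors` has to happen in the same invocation that performed quantization, and why those flags are disabled together with `--observe`.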
llama_inference.py
ADDED
@@ -0,0 +1,123 @@
import argparse

import torch
import torch.nn as nn
import quant

from gptq import GPTQ
from utils import find_layers, DEV, set_seed, get_wikitext2, get_ptb, get_c4, get_ptb_new, get_c4_new, get_loaders
import transformers
from transformers import AutoTokenizer


def get_llama(model):

    def skip(*args, **kwargs):
        pass

    torch.nn.init.kaiming_uniform_ = skip
    torch.nn.init.uniform_ = skip
    torch.nn.init.normal_ = skip
    from transformers import LlamaForCausalLM
    model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto')
    model.seqlen = 2048
    return model


def load_quant(model, checkpoint, wbits, groupsize=-1, fused_mlp=True, eval=True, warmup_autotune=True):
    from transformers import LlamaConfig, LlamaForCausalLM
    config = LlamaConfig.from_pretrained(model)

    def noop(*args, **kwargs):
        pass

    torch.nn.init.kaiming_uniform_ = noop
    torch.nn.init.uniform_ = noop
    torch.nn.init.normal_ = noop

    torch.set_default_dtype(torch.half)
    transformers.modeling_utils._init_weights = False
    torch.set_default_dtype(torch.half)
    model = LlamaForCausalLM(config)
    torch.set_default_dtype(torch.float)
    if eval:
        model = model.eval()
    layers = find_layers(model)
    for name in ['lm_head']:
        if name in layers:
            del layers[name]
    quant.make_quant_linear(model, layers, wbits, groupsize)

    del layers

    print('Loading model ...')
    if checkpoint.endswith('.safetensors'):
        from safetensors.torch import load_file as safe_load
        model.load_state_dict(safe_load(checkpoint), strict=False)
    else:
        model.load_state_dict(torch.load(checkpoint), strict=False)

    if eval:
        quant.make_quant_attn(model)
        quant.make_quant_norm(model)
        if fused_mlp:
            quant.make_fused_mlp(model)
    if warmup_autotune:
        quant.autotune_warmup_linear(model, transpose=not (eval))
        if eval and fused_mlp:
            quant.autotune_warmup_fused(model)
    model.seqlen = 2048
    print('Done.')

    return model


if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('model', type=str, help='llama model to load')
    parser.add_argument('--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], help='#bits to use for quantization; use 16 for evaluating base model.')
    parser.add_argument('--groupsize', type=int, default=-1, help='Groupsize to use for quantization; default uses full row.')
    parser.add_argument('--load', type=str, default='', help='Load quantized model.')

    parser.add_argument('--text', type=str, help='input text')

    parser.add_argument('--min_length', type=int, default=10, help='The minimum length of the sequence to be generated.')

    parser.add_argument('--max_length', type=int, default=50, help='The maximum length of the sequence to be generated.')

    parser.add_argument('--top_p',
                        type=float,
                        default=0.95,
                        help='If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.')

    parser.add_argument('--temperature', type=float, default=0.8, help='The value used to module the next token probabilities.')

    parser.add_argument('--device', type=int, default=-1, help='The device used to load the model when using safetensors. Default device is "cpu" or specify, 0,1,2,3,... for GPU device.')

    args = parser.parse_args()
args = parser.parse_args()
|
100 |
+
|
101 |
+
if type(args.load) is not str:
|
102 |
+
args.load = args.load.as_posix()
|
103 |
+
|
104 |
+
if args.load:
|
105 |
+
model = load_quant(args.model, args.load, args.wbits, args.groupsize)
|
106 |
+
else:
|
107 |
+
model = get_llama(args.model)
|
108 |
+
model.eval()
|
109 |
+
|
110 |
+
model.to(DEV)
|
111 |
+
tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)
|
112 |
+
input_ids = tokenizer.encode(args.text, return_tensors="pt").to(DEV)
|
113 |
+
|
114 |
+
with torch.no_grad():
|
115 |
+
generated_ids = model.generate(
|
116 |
+
input_ids,
|
117 |
+
do_sample=True,
|
118 |
+
min_length=args.min_length,
|
119 |
+
max_length=args.max_length,
|
120 |
+
top_p=args.top_p,
|
121 |
+
temperature=args.temperature,
|
122 |
+
)
|
123 |
+
print(tokenizer.decode([el.item() for el in generated_ids[0]]))
|
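A minimal sketch of driving the same load-then-generate flow programmatically instead of through the CLI above; the model id, checkpoint path, and prompt are placeholders, not files shipped with this commit:

```python
# Hypothetical usage of load_quant from llama_inference.py in another script.
import torch
from transformers import AutoTokenizer

from llama_inference import load_quant
from utils import DEV

model = load_quant('decapoda-research/llama-7b-hf',          # example model id
                   'llama7b-4bit-128g.safetensors', 4, 128)  # example checkpoint, wbits, groupsize
model.to(DEV)

tokenizer = AutoTokenizer.from_pretrained('decapoda-research/llama-7b-hf', use_fast=False)
input_ids = tokenizer.encode('The capital of France is', return_tensors='pt').to(DEV)
with torch.no_grad():
    generated = model.generate(input_ids, do_sample=True, max_length=32,
                               top_p=0.95, temperature=0.8)
print(tokenizer.decode(generated[0]))
```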
llama_inference_offload.py
ADDED
@@ -0,0 +1,279 @@
import torch
import torch.nn as nn

from gptq import GPTQ
import argparse
from utils import find_layers, DEV, set_seed, get_wikitext2, get_ptb, get_c4, get_ptb_new, get_c4_new, get_loaders
import quant

import transformers
from transformers import AutoTokenizer
from transformers.models.llama.modeling_llama import LlamaModel, LlamaConfig
from transformers.modeling_outputs import BaseModelOutputWithPast
from typing import List, Optional, Tuple, Union
from accelerate import cpu_offload_with_hook, load_checkpoint_in_model


class Offload_LlamaModel(LlamaModel):

    def __init__(self, config: LlamaConfig):
        super().__init__(config)

    def cpu_offload(self, preload):
        # Keep the first `preload` decoder layers on the GPU; hook the remaining layers so
        # accelerate streams them from CPU to GPU on demand during the forward pass.
        hook = None
        for cpu_offloaded_model in self.layers[preload:]:
            _, hook = cpu_offload_with_hook(cpu_offloaded_model, DEV, prev_module_hook=hook)

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.
                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.
                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0, config.n_positions - 1]`.
                [What are position IDs?](../glossary#position-ids)
            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
        seq_length_with_past = seq_length
        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # embed positions
        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device)
        attention_mask = self._prepare_decoder_attention_mask(attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length)

        hidden_states = inputs_embeds

        if self.gradient_checkpointing and self.training:
            # Only reachable during training with gradient checkpointing enabled.
            if use_cache:
                logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for idx in range(len(self.layers)):
            decoder_layer = self.layers[idx]

            if output_hidden_states:
                all_hidden_states += (hidden_states, )

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):

                    def custom_forward(*inputs):
                        # None for past_key_value
                        return module(*inputs, output_attentions, None)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),
                    hidden_states,
                    attention_mask,
                    position_ids,
                    None,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1], )

            if output_attentions:
                all_self_attns += (layer_outputs[1], )

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states, )

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


def load_quant(model, checkpoint, wbits, groupsize, pre_layer, fused_mlp=True, warmup_autotune=True):
    transformers.models.llama.modeling_llama.LlamaModel = Offload_LlamaModel
    from transformers import LlamaConfig, LlamaForCausalLM
    config = LlamaConfig.from_pretrained(model)

    def noop(*args, **kwargs):
        pass

    torch.nn.init.kaiming_uniform_ = noop
    torch.nn.init.uniform_ = noop
    torch.nn.init.normal_ = noop

    torch.set_default_dtype(torch.half)
    transformers.modeling_utils._init_weights = False
    torch.set_default_dtype(torch.half)
    model = LlamaForCausalLM(config)
    torch.set_default_dtype(torch.float)
    model = model.eval()
    layers = find_layers(model)
    for name in ['lm_head']:
        if name in layers:
            del layers[name]
    quant.make_quant_linear(model, layers, wbits, groupsize)

    print('Loading model ...')
    load_checkpoint_in_model(model, checkpoint, dtype='float16')
    model.seqlen = 2048

    # NOTE: unlike llama_inference.load_quant, `eval` is not a parameter here, so this
    # tests the Python builtin and is always true; the fused attention/norm path always runs.
    if eval:
        quant.make_quant_attn(model)
        quant.make_quant_norm(model)
        if fused_mlp:
            quant.make_fused_mlp(model)

    if warmup_autotune:
        quant.autotune_warmup_linear(model)
        if fused_mlp:
            quant.autotune_warmup_fused(model)

    for i in range(pre_layer):
        model.model.layers[i].to(DEV)
    model.model.embed_tokens.to(DEV)
    model.model.norm.to(DEV)
    model.lm_head.to(DEV)
    model.model.cpu_offload(pre_layer)
    print('Done.')
    return model


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('model', type=str, help='llama model to load')
    parser.add_argument('--wbits', type=int, default=4, choices=[2, 3, 4, 8], help='#bits to use for quantization')
    parser.add_argument('--groupsize', type=int, default=-1, help='Groupsize to use for quantization; default uses full row.')
    parser.add_argument('--load', type=str, default='', help='Load quantized model.')
    parser.add_argument('--text', type=str, help='input text')

    parser.add_argument('--min_length', type=int, default=10, help='The minimum length of the sequence to be generated.')

    parser.add_argument('--max_length', type=int, default=50, help='The maximum length of the sequence to be generated.')

    parser.add_argument('--top_p',
                        type=float,
                        default=0.95,
                        help='If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.')

    parser.add_argument('--temperature', type=float, default=0.8, help='The value used to modulate the next token probabilities.')

    parser.add_argument('--pre_layer', type=int, default=50, help='The number of layers to preload')

    args = parser.parse_args()

    if type(args.load) is not str:
        args.load = args.load.as_posix()

    model = load_quant(args.model, args.load, args.wbits, args.groupsize, args.pre_layer)

    tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)
    input_ids = tokenizer.encode(args.text, return_tensors="pt").to(DEV)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids,
            do_sample=True,
            min_length=args.min_length,
            max_length=args.max_length,
            top_p=args.top_p,
            temperature=args.temperature,
        )
    print(tokenizer.decode([el.item() for el in generated_ids[0]]))
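A short note on the offload split (values below are examples only): `--pre_layer` is the number of decoder layers kept resident on the GPU; the rest are wrapped by `cpu_offload_with_hook` and streamed in per forward pass, so a smaller value lowers VRAM use at the cost of speed. A hedged usage sketch:

```python
# Sketch only: load with 20 resident layers instead of the default 50 to fit a
# smaller GPU; the model id and checkpoint path are placeholders.
from llama_inference_offload import load_quant

model = load_quant('decapoda-research/llama-7b-hf',     # example model id
                   'llama7b-4bit-128g.safetensors',     # example checkpoint
                   wbits=4, groupsize=128, pre_layer=20)
```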
neox.py
ADDED
@@ -0,0 +1,430 @@
1 |
+
import argparse
|
2 |
+
import time
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
import quant
|
7 |
+
|
8 |
+
from gptq import GPTQ, Observer
|
9 |
+
from utils import find_layers, DEV, set_seed, get_wikitext2, get_ptb, get_c4, get_ptb_new, get_c4_new, get_loaders, export_quant_table, gen_conditions
|
10 |
+
from texttable import Texttable
|
11 |
+
|
12 |
+
|
13 |
+
def get_neox(model, seqlen=-1):
|
14 |
+
|
15 |
+
def skip(*args, **kwargs):
|
16 |
+
pass
|
17 |
+
|
18 |
+
torch.nn.init.kaiming_uniform_ = skip
|
19 |
+
torch.nn.init.uniform_ = skip
|
20 |
+
torch.nn.init.normal_ = skip
|
21 |
+
from transformers import GPTNeoXForCausalLM
|
22 |
+
model = GPTNeoXForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
|
23 |
+
model.seqlen = seqlen if seqlen != -1 else model.config.max_position_embeddings
|
24 |
+
return model
|
25 |
+
|
26 |
+
|
27 |
+
@torch.no_grad()
|
28 |
+
def neox_sequential(model, dataloader, dev):
|
29 |
+
print('Starting ...')
|
30 |
+
|
31 |
+
use_cache = model.config.use_cache
|
32 |
+
model.config.use_cache = False
|
33 |
+
layers = model.gpt_neox.layers
|
34 |
+
|
35 |
+
model.gpt_neox.embed_in = model.gpt_neox.embed_in.to(dev)
|
36 |
+
layers[0] = layers[0].to(dev)
|
37 |
+
|
38 |
+
dtype = next(iter(model.parameters())).dtype
|
39 |
+
inps = torch.zeros((args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev)
|
40 |
+
cache = {'i': 0, 'attention_mask': None}
|
41 |
+
|
42 |
+
class Catcher(nn.Module):
|
43 |
+
|
44 |
+
def __init__(self, module):
|
45 |
+
super().__init__()
|
46 |
+
self.module = module
|
47 |
+
|
48 |
+
def forward(self, inp, **kwargs):
|
49 |
+
inps[cache['i']] = inp
|
50 |
+
cache['i'] += 1
|
51 |
+
cache['attention_mask'] = kwargs['attention_mask']
|
52 |
+
cache['position_ids'] = kwargs['position_ids']
|
53 |
+
raise ValueError
|
54 |
+
|
55 |
+
layers[0] = Catcher(layers[0])
|
56 |
+
for batch in dataloader:
|
57 |
+
try:
|
58 |
+
model(batch[0].to(dev))
|
59 |
+
except ValueError:
|
60 |
+
pass
|
61 |
+
layers[0] = layers[0].module
|
62 |
+
|
63 |
+
layers[0] = layers[0].cpu()
|
64 |
+
model.gpt_neox.embed_in = model.gpt_neox.embed_in.cpu()
|
65 |
+
torch.cuda.empty_cache()
|
66 |
+
|
67 |
+
outs = torch.zeros_like(inps)
|
68 |
+
attention_mask = cache['attention_mask']
|
69 |
+
position_ids = cache['position_ids']
|
70 |
+
|
71 |
+
print('Ready.')
|
72 |
+
|
73 |
+
quantizers = {}
|
74 |
+
observer = Observer()
|
75 |
+
for i in range(len(layers)):
|
76 |
+
|
77 |
+
print(f'Quantizing layer {i+1}/{len(layers)}..')
|
78 |
+
print('+------------------+--------------+------------+-----------+-------+')
|
79 |
+
print('| name | weight_error | fp_inp_SNR | q_inp_SNR | time |')
|
80 |
+
print('+==================+==============+============+===========+=======+')
|
81 |
+
|
82 |
+
layer = layers[i].to(dev)
|
83 |
+
full = find_layers(layer)
|
84 |
+
sequential = [list(full.keys())]
|
85 |
+
|
86 |
+
for names in sequential:
|
87 |
+
subset = {n: full[n] for n in names}
|
88 |
+
gptq = {}
|
89 |
+
for name in subset:
|
90 |
+
gptq[name] = GPTQ(subset[name], observe=False)
|
91 |
+
gptq[name].quantizer.configure(args.wbits, perchannel=True, sym=args.sym, mse=False)
|
92 |
+
|
93 |
+
def add_batch(name):
|
94 |
+
|
95 |
+
def tmp(_, inp, out):
|
96 |
+
gptq[name].add_batch(inp[0].data, out.data)
|
97 |
+
|
98 |
+
return tmp
|
99 |
+
|
100 |
+
handles = []
|
101 |
+
for name in subset:
|
102 |
+
handles.append(subset[name].register_forward_hook(add_batch(name)))
|
103 |
+
for j in range(args.nsamples):
|
104 |
+
outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
|
105 |
+
for h in handles:
|
106 |
+
h.remove()
|
107 |
+
|
108 |
+
for name in subset:
|
109 |
+
scale, zero, g_idx, error = gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order, name=name)
|
110 |
+
quantizers['gpt_neox.layers.%d.%s' % (i, name)] = (gptq[name].quantizer.cpu(), scale.cpu(), zero.cpu(), g_idx.cpu(), args.wbits, args.groupsize)
|
111 |
+
gptq[name].free()
|
112 |
+
|
113 |
+
for j in range(args.nsamples):
|
114 |
+
outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
|
115 |
+
|
116 |
+
layers[i] = layer.cpu()
|
117 |
+
del layer
|
118 |
+
del gptq
|
119 |
+
torch.cuda.empty_cache()
|
120 |
+
|
121 |
+
inps, outs = outs, inps
|
122 |
+
print('+------------------+--------------+------------+-----------+-------+')
|
123 |
+
print('\n')
|
124 |
+
|
125 |
+
model.config.use_cache = use_cache
|
126 |
+
|
127 |
+
return quantizers
|
128 |
+
|
129 |
+
|
130 |
+
@torch.no_grad()
|
131 |
+
def neox_eval(model, testenc, dev):
|
132 |
+
print('Evaluating ...')
|
133 |
+
|
134 |
+
testenc = testenc.input_ids
|
135 |
+
nsamples = testenc.numel() // model.seqlen
|
136 |
+
|
137 |
+
use_cache = model.config.use_cache
|
138 |
+
model.config.use_cache = False
|
139 |
+
layers = model.gpt_neox.layers
|
140 |
+
|
141 |
+
model.gpt_neox.embed_in = model.gpt_neox.embed_in.to(dev)
|
142 |
+
layers[0] = layers[0].to(dev)
|
143 |
+
|
144 |
+
dtype = next(iter(model.parameters())).dtype
|
145 |
+
inps = torch.zeros((nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev)
|
146 |
+
cache = {'i': 0, 'attention_mask': None}
|
147 |
+
|
148 |
+
class Catcher(nn.Module):
|
149 |
+
|
150 |
+
def __init__(self, module):
|
151 |
+
super().__init__()
|
152 |
+
self.module = module
|
153 |
+
|
154 |
+
def forward(self, inp, **kwargs):
|
155 |
+
inps[cache['i']] = inp
|
156 |
+
cache['i'] += 1
|
157 |
+
cache['attention_mask'] = kwargs['attention_mask']
|
158 |
+
cache['position_ids'] = kwargs['position_ids']
|
159 |
+
raise ValueError
|
160 |
+
|
161 |
+
layers[0] = Catcher(layers[0])
|
162 |
+
for i in range(nsamples):
|
163 |
+
batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
|
164 |
+
try:
|
165 |
+
model(batch)
|
166 |
+
except ValueError:
|
167 |
+
pass
|
168 |
+
layers[0] = layers[0].module
|
169 |
+
|
170 |
+
layers[0] = layers[0].cpu()
|
171 |
+
model.gpt_neox.embed_in = model.gpt_neox.embed_in.cpu()
|
172 |
+
torch.cuda.empty_cache()
|
173 |
+
|
174 |
+
outs = torch.zeros_like(inps)
|
175 |
+
attention_mask = cache['attention_mask']
|
176 |
+
position_ids = cache['position_ids']
|
177 |
+
|
178 |
+
for i in range(len(layers)):
|
179 |
+
print(i)
|
180 |
+
layer = layers[i].to(dev)
|
181 |
+
|
182 |
+
if args.nearest:
|
183 |
+
subset = find_layers(layer)
|
184 |
+
for name in subset:
|
185 |
+
quantizer = quant.Quantizer()
|
186 |
+
quantizer.configure(args.wbits, perchannel=True, sym=args.sym, mse=False)
|
187 |
+
W = subset[name].weight.data
|
188 |
+
quantizer.find_params(W, weight=True)
|
189 |
+
subset[name].weight.data = quantizer.quantize(W).to(next(iter(layer.parameters())).dtype)
|
190 |
+
|
191 |
+
for j in range(nsamples):
|
192 |
+
outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
|
193 |
+
layers[i] = layer.cpu()
|
194 |
+
del layer
|
195 |
+
torch.cuda.empty_cache()
|
196 |
+
inps, outs = outs, inps
|
197 |
+
|
198 |
+
model.gpt_neox.final_layer_norm = model.gpt_neox.final_layer_norm.to(dev)
|
199 |
+
model.embed_out = model.embed_out.to(dev)
|
200 |
+
|
201 |
+
testenc = testenc.to(dev)
|
202 |
+
nlls = []
|
203 |
+
for i in range(nsamples):
|
204 |
+
hidden_states = inps[i].unsqueeze(0)
|
205 |
+
hidden_states = model.gpt_neox.final_layer_norm(hidden_states)
|
206 |
+
lm_logits = model.embed_out(hidden_states)
|
207 |
+
shift_logits = lm_logits[:, :-1, :].contiguous()
|
208 |
+
shift_labels = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)][:, 1:]
|
209 |
+
loss_fct = nn.CrossEntropyLoss()
|
210 |
+
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
211 |
+
neg_log_likelihood = loss.float() * model.seqlen
|
212 |
+
nlls.append(neg_log_likelihood)
|
213 |
+
ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
|
214 |
+
print(ppl.item())
|
215 |
+
|
216 |
+
model.config.use_cache = use_cache
|
217 |
+
|
218 |
+
|
219 |
+
# TODO: perform packing on GPU
|
220 |
+
def neox_pack(model, quantizers, wbits, groupsize):
|
221 |
+
layers = find_layers(model)
|
222 |
+
layers = {n: layers[n] for n in quantizers}
|
223 |
+
quant.make_quant_linear(model, quantizers, wbits, groupsize)
|
224 |
+
qlayers = find_layers(model, [quant.QuantLinear])
|
225 |
+
print('Packing ...')
|
226 |
+
for name in qlayers:
|
227 |
+
print(name)
|
228 |
+
quantizers[name], scale, zero, g_idx, _, _ = quantizers[name]
|
229 |
+
qlayers[name].pack(layers[name], scale, zero, g_idx)
|
230 |
+
print('Done.')
|
231 |
+
return model
|
232 |
+
|
233 |
+
|
234 |
+
def load_quant(model, checkpoint, wbits, groupsize=-1, eval=True, warmup_autotune=True):
|
235 |
+
from transformers import GPTNeoXConfig, GPTNeoXForCausalLM, modeling_utils
|
236 |
+
config = GPTNeoXConfig.from_pretrained(model)
|
237 |
+
|
238 |
+
def noop(*args, **kwargs):
|
239 |
+
pass
|
240 |
+
|
241 |
+
torch.nn.init.kaiming_uniform_ = noop
|
242 |
+
torch.nn.init.uniform_ = noop
|
243 |
+
torch.nn.init.normal_ = noop
|
244 |
+
|
245 |
+
torch.set_default_dtype(torch.half)
|
246 |
+
modeling_utils._init_weights = False
|
247 |
+
torch.set_default_dtype(torch.half)
|
248 |
+
model = GPTNeoXForCausalLM(config)
|
249 |
+
torch.set_default_dtype(torch.float)
|
250 |
+
if eval:
|
251 |
+
model = model.eval()
|
252 |
+
layers = find_layers(model)
|
253 |
+
for name in ['embed_in','embed_out']:
|
254 |
+
if name in layers:
|
255 |
+
del layers[name]
|
256 |
+
quant.make_quant_linear(model, layers, wbits, groupsize)
|
257 |
+
|
258 |
+
del layers
|
259 |
+
|
260 |
+
print('Loading model ...')
|
261 |
+
if checkpoint.endswith('.safetensors'):
|
262 |
+
from safetensors.torch import load_file as safe_load
|
263 |
+
model.load_state_dict(safe_load(checkpoint))
|
264 |
+
else:
|
265 |
+
model.load_state_dict(torch.load(checkpoint))
|
266 |
+
|
267 |
+
if warmup_autotune:
|
268 |
+
quant.autotune_warmup_linear(model, transpose=not (eval))
|
269 |
+
|
270 |
+
model.seqlen = model.config.max_position_embeddings
|
271 |
+
print('Done.')
|
272 |
+
|
273 |
+
return model
|
274 |
+
|
275 |
+
|
276 |
+
def neox_multigpu(model, gpus):
|
277 |
+
model.gpt_neox.embed_in = model.gpt_neox.embed_in.to(gpus[0])
|
278 |
+
model.gpt_neox.final_layer_norm = model.gpt_neox.final_layer_norm.to(gpus[-1])
|
279 |
+
import copy
|
280 |
+
model.embed_out = copy.deepcopy(model.embed_out).to(gpus[-1])
|
281 |
+
|
282 |
+
cache = {'mask': None}
|
283 |
+
|
284 |
+
class MoveModule(nn.Module):
|
285 |
+
|
286 |
+
def __init__(self, module):
|
287 |
+
super().__init__()
|
288 |
+
self.module = module
|
289 |
+
self.dev = next(iter(self.module.parameters())).device
|
290 |
+
|
291 |
+
def forward(self, *inp, **kwargs):
|
292 |
+
inp = list(inp)
|
293 |
+
if inp[0].device != self.dev:
|
294 |
+
inp[0] = inp[0].to(self.dev)
|
295 |
+
if cache['mask'] is None or cache['mask'].device != self.dev:
|
296 |
+
cache['mask'] = kwargs['attention_mask'].to(self.dev)
|
297 |
+
kwargs['attention_mask'] = cache['mask']
|
298 |
+
tmp = self.module(*inp, **kwargs)
|
299 |
+
return tmp
|
300 |
+
|
301 |
+
layers = model.gpt_neox.layers
|
302 |
+
pergpu = math.ceil(len(layers) / len(gpus))
|
303 |
+
for i in range(len(layers)):
|
304 |
+
layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
|
305 |
+
|
306 |
+
model.gpus = gpus
|
307 |
+
|
308 |
+
|
309 |
+
def benchmark(model, input_ids, check=False):
|
310 |
+
input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
|
311 |
+
torch.cuda.synchronize()
|
312 |
+
|
313 |
+
cache = {'past': None}
|
314 |
+
|
315 |
+
def clear_past(i):
|
316 |
+
|
317 |
+
def tmp(layer, inp, out):
|
318 |
+
if cache['past']:
|
319 |
+
cache['past'][i] = None
|
320 |
+
|
321 |
+
return tmp
|
322 |
+
|
323 |
+
for i, layer in enumerate(model.gpt_neox.layers):
|
324 |
+
layer.register_forward_hook(clear_past(i))
|
325 |
+
|
326 |
+
print('Benchmarking ...')
|
327 |
+
|
328 |
+
if check:
|
329 |
+
loss = nn.CrossEntropyLoss()
|
330 |
+
tot = 0.
|
331 |
+
|
332 |
+
def sync():
|
333 |
+
if hasattr(model, 'gpus'):
|
334 |
+
for gpu in model.gpus:
|
335 |
+
torch.cuda.synchronize(gpu)
|
336 |
+
else:
|
337 |
+
torch.cuda.synchronize()
|
338 |
+
|
339 |
+
max_memory = 0
|
340 |
+
with torch.no_grad():
|
341 |
+
attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
|
342 |
+
times = []
|
343 |
+
for i in range(input_ids.numel()):
|
344 |
+
tick = time.time()
|
345 |
+
out = model(input_ids[:, i:i + 1], past_key_values=cache['past'], attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)))
|
346 |
+
sync()
|
347 |
+
times.append(time.time() - tick)
|
348 |
+
print(i, times[-1])
|
349 |
+
max_memory = max(max_memory, torch.cuda.memory_allocated() / 1024 / 1024)
|
350 |
+
if check and i != input_ids.numel() - 1:
|
351 |
+
tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
|
352 |
+
cache['past'] = list(out.past_key_values)
|
353 |
+
del out
|
354 |
+
sync()
|
355 |
+
print('Median:', np.median(times))
|
356 |
+
if check:
|
357 |
+
print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
|
358 |
+
print('max memory(MiB):', max_memory)
|
359 |
+
|
360 |
+
|
361 |
+
if __name__ == '__main__':
|
362 |
+
|
363 |
+
parser = argparse.ArgumentParser()
|
364 |
+
|
365 |
+
parser.add_argument('model', type=str, help='llama model to load')
|
366 |
+
parser.add_argument('dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], help='Where to extract calibration data from.')
|
367 |
+
parser.add_argument('--seed', type=int, default=0, help='Seed for sampling the calibration data.')
|
368 |
+
parser.add_argument('--nsamples', type=int, default=128, help='Number of calibration data samples.')
|
369 |
+
parser.add_argument('--percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.')
|
370 |
+
parser.add_argument('--nearest', action='store_true', help='Whether to run the RTN baseline.')
|
371 |
+
parser.add_argument('--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], help='bits to use for quantization; use 16 for evaluating base model.')
|
372 |
+
parser.add_argument('--seqlen', type=int, default=-1, help='seqlen to use for quantization; default uses full seqlen')
|
373 |
+
parser.add_argument('--trits', action='store_true', help='Whether to use trits for quantization.')
|
374 |
+
parser.add_argument('--groupsize', type=int, default=-1, help='Groupsize to use for quantization; default uses full row.')
|
375 |
+
parser.add_argument('--eval', action='store_true', help='evaluate quantized model.')
|
376 |
+
parser.add_argument('--save', type=str, default='', help='Save quantized checkpoint under this name.')
|
377 |
+
parser.add_argument('--save_safetensors', type=str, default='', help='Save quantized `.safetensors` checkpoint under this name.')
|
378 |
+
parser.add_argument('--load', type=str, default='', help='Load quantized model.')
|
379 |
+
parser.add_argument('--benchmark', type=int, default=0, help='Number of tokens to use for benchmarking.')
|
380 |
+
parser.add_argument('--check', action='store_true', help='Whether to compute perplexity during benchmarking for verification.')
|
381 |
+
parser.add_argument('--sym', action='store_true', help='Whether to perform symmetric quantization.')
|
382 |
+
parser.add_argument('--act-order', action='store_true', help='Whether to apply the activation order GPTQ heuristic')
|
383 |
+
parser.add_argument('--new-eval', action='store_true', help='Whether to use the new PTB and C4 eval')
|
384 |
+
args = parser.parse_args()
|
385 |
+
|
386 |
+
if type(args.load) is not str:
|
387 |
+
args.load = args.load.as_posix()
|
388 |
+
|
389 |
+
if args.load:
|
390 |
+
model = load_quant(args.model, args.load, args.wbits, args.groupsize)
|
391 |
+
else:
|
392 |
+
model = get_neox(args.model)
|
393 |
+
model.eval()
|
394 |
+
|
395 |
+
dataloader, testloader = get_loaders(args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen)
|
396 |
+
|
397 |
+
if not args.load and args.wbits < 16 and not args.nearest:
|
398 |
+
tick = time.time()
|
399 |
+
quantizers = neox_sequential(model, dataloader, DEV)
|
400 |
+
print(time.time() - tick)
|
401 |
+
|
402 |
+
if args.benchmark:
|
403 |
+
gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
|
404 |
+
if len(gpus) > 1:
|
405 |
+
neox_multigpu(model, gpus)
|
406 |
+
else:
|
407 |
+
model = model.to(DEV)
|
408 |
+
if args.benchmark:
|
409 |
+
input_ids = next(iter(dataloader))[0][:, :args.benchmark]
|
410 |
+
benchmark(model, input_ids, check=args.check)
|
411 |
+
|
412 |
+
if args.eval:
|
413 |
+
datasets = ['wikitext2', 'ptb', 'c4']
|
414 |
+
if args.new_eval:
|
415 |
+
datasets = ['wikitext2', 'ptb-new', 'c4-new']
|
416 |
+
for dataset in datasets:
|
417 |
+
dataloader, testloader = get_loaders(dataset, seed=args.seed, model=args.model, seqlen=model.seqlen)
|
418 |
+
print(dataset)
|
419 |
+
neox_eval(model, testloader, DEV)
|
420 |
+
|
421 |
+
if args.save:
|
422 |
+
neox_pack(model, quantizers, args.wbits, args.groupsize)
|
423 |
+
torch.save(model.state_dict(), args.save)
|
424 |
+
|
425 |
+
if args.save_safetensors:
|
426 |
+
neox_pack(model, quantizers, args.wbits, args.groupsize)
|
427 |
+
from safetensors.torch import save_file as safe_save
|
428 |
+
state_dict = model.state_dict()
|
429 |
+
state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()}
|
430 |
+
safe_save(state_dict, args.save_safetensors)
|
opt.py
ADDED
@@ -0,0 +1,446 @@
1 |
+
import time
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
import argparse
|
6 |
+
|
7 |
+
import transformers
|
8 |
+
from gptq import GPTQ
|
9 |
+
from utils import find_layers, DEV, set_seed, get_wikitext2, get_ptb, get_c4, get_ptb_new, get_c4_new, get_loaders
|
10 |
+
import quant
|
11 |
+
|
12 |
+
|
13 |
+
def get_opt(model):
|
14 |
+
import torch
|
15 |
+
|
16 |
+
def skip(*args, **kwargs):
|
17 |
+
pass
|
18 |
+
|
19 |
+
torch.nn.init.kaiming_uniform_ = skip
|
20 |
+
torch.nn.init.uniform_ = skip
|
21 |
+
torch.nn.init.normal_ = skip
|
22 |
+
from transformers import OPTForCausalLM
|
23 |
+
model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto')
|
24 |
+
model.seqlen = model.config.max_position_embeddings
|
25 |
+
return model
|
26 |
+
|
27 |
+
|
28 |
+
@torch.no_grad()
|
29 |
+
def opt_sequential(model, dataloader, dev):
|
30 |
+
print('Starting ...')
|
31 |
+
|
32 |
+
use_cache = model.config.use_cache
|
33 |
+
model.config.use_cache = False
|
34 |
+
layers = model.model.decoder.layers
|
35 |
+
|
36 |
+
model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
|
37 |
+
model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
|
38 |
+
if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
|
39 |
+
model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
|
40 |
+
if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
|
41 |
+
model.model.decoder.project_in = model.model.decoder.project_in.to(dev)
|
42 |
+
layers[0] = layers[0].to(dev)
|
43 |
+
|
44 |
+
dtype = next(iter(model.parameters())).dtype
|
45 |
+
inps = torch.zeros((args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev)
|
46 |
+
cache = {'i': 0, 'attention_mask': None}
|
47 |
+
|
48 |
+
class Catcher(nn.Module):
|
49 |
+
|
50 |
+
def __init__(self, module):
|
51 |
+
super().__init__()
|
52 |
+
self.module = module
|
53 |
+
|
54 |
+
def forward(self, inp, **kwargs):
|
55 |
+
inps[cache['i']] = inp
|
56 |
+
cache['i'] += 1
|
57 |
+
cache['attention_mask'] = kwargs['attention_mask']
|
58 |
+
raise ValueError
|
59 |
+
|
60 |
+
layers[0] = Catcher(layers[0])
|
61 |
+
for batch in dataloader:
|
62 |
+
try:
|
63 |
+
model(batch[0].to(dev))
|
64 |
+
except ValueError:
|
65 |
+
pass
|
66 |
+
layers[0] = layers[0].module
|
67 |
+
|
68 |
+
layers[0] = layers[0].cpu()
|
69 |
+
model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
|
70 |
+
model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
|
71 |
+
if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
|
72 |
+
model.model.decoder.project_out = model.model.decoder.project_out.cpu()
|
73 |
+
if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
|
74 |
+
model.model.decoder.project_in = model.model.decoder.project_in.cpu()
|
75 |
+
torch.cuda.empty_cache()
|
76 |
+
|
77 |
+
outs = torch.zeros_like(inps)
|
78 |
+
attention_mask = cache['attention_mask']
|
79 |
+
|
80 |
+
print('Ready.')
|
81 |
+
|
82 |
+
quantizers = {}
|
83 |
+
for i in range(len(layers)):
|
84 |
+
layer = layers[i].to(dev)
|
85 |
+
|
86 |
+
subset = find_layers(layer)
|
87 |
+
gptq = {}
|
88 |
+
for name in subset:
|
89 |
+
gptq[name] = GPTQ(subset[name])
|
90 |
+
gptq[name].quantizer = quant.Quantizer()
|
91 |
+
gptq[name].quantizer.configure(args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits)
|
92 |
+
|
93 |
+
def add_batch(name):
|
94 |
+
|
95 |
+
def tmp(_, inp, out):
|
96 |
+
gptq[name].add_batch(inp[0].data, out.data)
|
97 |
+
|
98 |
+
return tmp
|
99 |
+
|
100 |
+
handles = []
|
101 |
+
for name in subset:
|
102 |
+
handles.append(subset[name].register_forward_hook(add_batch(name)))
|
103 |
+
|
104 |
+
for j in range(args.nsamples):
|
105 |
+
outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
|
106 |
+
|
107 |
+
for h in handles:
|
108 |
+
h.remove()
|
109 |
+
|
110 |
+
for name in subset:
|
111 |
+
print(f'Quantizing {name} in layer {i+1}/{len(layers)}...')
|
112 |
+
scale, zero, g_idx, _ = gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order)
|
113 |
+
quantizers['model.decoder.layers.%d.%s' % (i, name)] = (gptq[name].quantizer.cpu(), scale.cpu(), zero.cpu(), g_idx.cpu())
|
114 |
+
gptq[name].free()
|
115 |
+
|
116 |
+
for j in range(args.nsamples):
|
117 |
+
outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
|
118 |
+
|
119 |
+
layers[i] = layer.cpu()
|
120 |
+
del layer
|
121 |
+
del gptq
|
122 |
+
torch.cuda.empty_cache()
|
123 |
+
|
124 |
+
inps, outs = outs, inps
|
125 |
+
|
126 |
+
model.config.use_cache = use_cache
|
127 |
+
|
128 |
+
return quantizers
|
129 |
+
|
130 |
+
|
131 |
+
@torch.no_grad()
|
132 |
+
def opt_eval(model, testenc, dev):
|
133 |
+
print('Evaluating ...')
|
134 |
+
|
135 |
+
testenc = testenc.input_ids
|
136 |
+
nsamples = testenc.numel() // model.seqlen
|
137 |
+
|
138 |
+
use_cache = model.config.use_cache
|
139 |
+
model.config.use_cache = False
|
140 |
+
layers = model.model.decoder.layers
|
141 |
+
|
142 |
+
model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
|
143 |
+
model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
|
144 |
+
if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
|
145 |
+
model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
|
146 |
+
if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
|
147 |
+
model.model.decoder.project_in = model.model.decoder.project_in.to(dev)
|
148 |
+
layers[0] = layers[0].to(dev)
|
149 |
+
|
150 |
+
dtype = next(iter(model.parameters())).dtype
|
151 |
+
inps = torch.zeros((nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev)
|
152 |
+
cache = {'i': 0, 'attention_mask': None}
|
153 |
+
|
154 |
+
class Catcher(nn.Module):
|
155 |
+
|
156 |
+
def __init__(self, module):
|
157 |
+
super().__init__()
|
158 |
+
self.module = module
|
159 |
+
|
160 |
+
def forward(self, inp, **kwargs):
|
161 |
+
inps[cache['i']] = inp
|
162 |
+
cache['i'] += 1
|
163 |
+
cache['attention_mask'] = kwargs['attention_mask']
|
164 |
+
raise ValueError
|
165 |
+
|
166 |
+
layers[0] = Catcher(layers[0])
|
167 |
+
for i in range(nsamples):
|
168 |
+
batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
|
169 |
+
try:
|
170 |
+
model(batch)
|
171 |
+
except ValueError:
|
172 |
+
pass
|
173 |
+
layers[0] = layers[0].module
|
174 |
+
|
175 |
+
layers[0] = layers[0].cpu()
|
176 |
+
model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
|
177 |
+
model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
|
178 |
+
if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
|
179 |
+
model.model.decoder.project_out = model.model.decoder.project_out.cpu()
|
180 |
+
if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
|
181 |
+
model.model.decoder.project_in = model.model.decoder.project_in.cpu()
|
182 |
+
torch.cuda.empty_cache()
|
183 |
+
|
184 |
+
outs = torch.zeros_like(inps)
|
185 |
+
attention_mask = cache['attention_mask']
|
186 |
+
|
187 |
+
for i in range(len(layers)):
|
188 |
+
print(i)
|
189 |
+
layer = layers[i].to(dev)
|
190 |
+
|
191 |
+
if args.nearest:
|
192 |
+
subset = find_layers(layer)
|
193 |
+
for name in subset:
|
194 |
+
quantizer = quant.Quantizer()
|
195 |
+
quantizer.configure(args.wbits, perchannel=True, sym=args.sym, mse=False)
|
196 |
+
W = subset[name].weight.data
|
197 |
+
quantizer.find_params(W, weight=True)
|
198 |
+
subset[name].weight.data = quantizer.quantize(W).to(next(iter(layer.parameters())).dtype)
|
199 |
+
|
200 |
+
for j in range(nsamples):
|
201 |
+
outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
|
202 |
+
layers[i] = layer.cpu()
|
203 |
+
del layer
|
204 |
+
torch.cuda.empty_cache()
|
205 |
+
inps, outs = outs, inps
|
206 |
+
|
207 |
+
if model.model.decoder.final_layer_norm is not None:
|
208 |
+
model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
|
209 |
+
if model.model.decoder.project_out is not None:
|
210 |
+
model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
|
211 |
+
model.lm_head = model.lm_head.to(dev)
|
212 |
+
|
213 |
+
testenc = testenc.to(dev)
|
214 |
+
nlls = []
|
215 |
+
for i in range(nsamples):
|
216 |
+
hidden_states = inps[i].unsqueeze(0)
|
217 |
+
if model.model.decoder.final_layer_norm is not None:
|
218 |
+
hidden_states = model.model.decoder.final_layer_norm(hidden_states)
|
219 |
+
if model.model.decoder.project_out is not None:
|
220 |
+
hidden_states = model.model.decoder.project_out(hidden_states)
|
221 |
+
lm_logits = model.lm_head(hidden_states)
|
222 |
+
shift_logits = lm_logits[:, :-1, :].contiguous()
|
223 |
+
shift_labels = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)][:, 1:]
|
224 |
+
loss_fct = nn.CrossEntropyLoss()
|
225 |
+
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
226 |
+
neg_log_likelihood = loss.float() * model.seqlen
|
227 |
+
nlls.append(neg_log_likelihood)
|
228 |
+
ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
|
229 |
+
print(ppl.item())
|
230 |
+
|
231 |
+
model.config.use_cache = use_cache
|
232 |
+
|
233 |
+
|
234 |
+
# TODO: perform packing on GPU
|
235 |
+
def opt_pack(model, quantizers, wbits, groupsize):
|
236 |
+
layers = find_layers(model)
|
237 |
+
layers = {n: layers[n] for n in quantizers}
|
238 |
+
quant.make_quant_linear(model, quantizers, wbits, groupsize)
|
239 |
+
qlayers = find_layers(model, [quant.QuantLinear])
|
240 |
+
print('Packing ...')
|
241 |
+
for name in qlayers:
|
242 |
+
print(name)
|
243 |
+
quantizers[name], scale, zero, g_idx = quantizers[name]
|
244 |
+
qlayers[name].pack(layers[name], scale, zero, g_idx)
|
245 |
+
print('Done.')
|
246 |
+
return model
|
247 |
+
|
248 |
+
|
249 |
+
def load_quant(model, checkpoint, wbits, groupsize=-1, eval=True, warmup_autotune=True):
|
250 |
+
from transformers import OPTConfig, OPTForCausalLM
|
251 |
+
config = OPTConfig.from_pretrained(model)
|
252 |
+
|
253 |
+
def noop(*args, **kwargs):
|
254 |
+
pass
|
255 |
+
|
256 |
+
torch.nn.init.kaiming_uniform_ = noop
|
257 |
+
torch.nn.init.uniform_ = noop
|
258 |
+
torch.nn.init.normal_ = noop
|
259 |
+
|
260 |
+
torch.set_default_dtype(torch.half)
|
261 |
+
transformers.modeling_utils._init_weights = False
|
262 |
+
torch.set_default_dtype(torch.half)
|
263 |
+
model = OPTForCausalLM(config)
|
264 |
+
torch.set_default_dtype(torch.float)
|
265 |
+
model = model.eval()
|
266 |
+
layers = find_layers(model)
|
267 |
+
for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']:
|
268 |
+
if name in layers:
|
269 |
+
del layers[name]
|
270 |
+
quant.make_quant_linear(model, layers, wbits, groupsize)
|
271 |
+
|
272 |
+
del layers
|
273 |
+
|
274 |
+
print('Loading model ...')
|
275 |
+
if checkpoint.endswith('.safetensors'):
|
276 |
+
from safetensors.torch import load_file as safe_load
|
277 |
+
model.load_state_dict(safe_load(checkpoint))
|
278 |
+
else:
|
279 |
+
model.load_state_dict(torch.load(checkpoint))
|
280 |
+
|
281 |
+
if warmup_autotune:
|
282 |
+
quant.autotune_warmup_linear(model, transpose=not (eval))
|
283 |
+
model.seqlen = model.config.max_position_embeddings
|
284 |
+
print('Done.')
|
285 |
+
return model
|
286 |
+
|
287 |
+
|
288 |
+
def opt_multigpu(model, gpus):
|
289 |
+
model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(gpus[0])
|
290 |
+
model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(gpus[0])
|
291 |
+
if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
|
292 |
+
model.model.decoder.project_in = model.model.decoder.project_in.to(gpus[0])
|
293 |
+
if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
|
294 |
+
model.model.decoder.project_out = model.model.decoder.project_out.to(gpus[-1])
|
295 |
+
if hasattr(model.model.decoder, 'final_layer_norm') and model.model.decoder.final_layer_norm:
|
296 |
+
model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(gpus[-1])
|
297 |
+
import copy
|
298 |
+
model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
|
299 |
+
|
300 |
+
cache = {'mask': None}
|
301 |
+
|
302 |
+
class MoveModule(nn.Module):
|
303 |
+
|
304 |
+
def __init__(self, module):
|
305 |
+
super().__init__()
|
306 |
+
self.module = module
|
307 |
+
self.dev = next(iter(self.module.parameters())).device
|
308 |
+
|
309 |
+
def forward(self, *inp, **kwargs):
|
310 |
+
inp = list(inp)
|
311 |
+
if inp[0].device != self.dev:
|
312 |
+
inp[0] = inp[0].to(self.dev)
|
313 |
+
if cache['mask'] is None or cache['mask'].device != self.dev:
|
314 |
+
cache['mask'] = kwargs['attention_mask'].to(self.dev)
|
315 |
+
kwargs['attention_mask'] = cache['mask']
|
316 |
+
tmp = self.module(*inp, **kwargs)
|
317 |
+
return tmp
|
318 |
+
|
319 |
+
layers = model.model.decoder.layers
|
320 |
+
pergpu = math.ceil(len(layers) / len(gpus))
|
321 |
+
for i in range(len(layers)):
|
322 |
+
layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
|
323 |
+
|
324 |
+
model.gpus = gpus
|
325 |
+
|
326 |
+
|
327 |
+
def benchmark(model, input_ids, check=False):
|
328 |
+
input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
|
329 |
+
torch.cuda.synchronize()
|
330 |
+
|
331 |
+
cache = {'past': None}
|
332 |
+
|
333 |
+
def clear_past(i):
|
334 |
+
|
335 |
+
def tmp(layer, inp, out):
|
336 |
+
if cache['past']:
|
337 |
+
cache['past'][i] = None
|
338 |
+
|
339 |
+
return tmp
|
340 |
+
|
341 |
+
for i, layer in enumerate(model.model.decoder.layers):
|
342 |
+
layer.register_forward_hook(clear_past(i))
|
343 |
+
|
344 |
+
print('Benchmarking ...')
|
345 |
+
|
346 |
+
if check:
|
347 |
+
loss = nn.CrossEntropyLoss()
|
348 |
+
tot = 0.
|
349 |
+
|
350 |
+
def sync():
|
351 |
+
if hasattr(model, 'gpus'):
|
352 |
+
for gpu in model.gpus:
|
353 |
+
torch.cuda.synchronize(gpu)
|
354 |
+
else:
|
355 |
+
torch.cuda.synchronize()
|
356 |
+
|
357 |
+
with torch.no_grad():
|
358 |
+
attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
|
359 |
+
times = []
|
360 |
+
for i in range(input_ids.numel()):
|
361 |
+
tick = time.time()
|
362 |
+
out = model(input_ids[:, i].reshape(-1), past_key_values=cache['past'], attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)))
|
363 |
+
sync()
|
364 |
+
times.append(time.time() - tick)
|
365 |
+
print(i, times[-1])
|
366 |
+
if check and i != input_ids.numel() - 1:
|
367 |
+
tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float()
|
368 |
+
cache['past'] = list(out.past_key_values)
|
369 |
+
del out
|
370 |
+
sync()
|
371 |
+
import numpy as np
|
372 |
+
print('Median:', np.median(times))
|
373 |
+
if check:
|
374 |
+
print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
|
375 |
+
|
376 |
+
|
377 |
+
if __name__ == '__main__':
|
378 |
+
|
379 |
+
parser = argparse.ArgumentParser()
|
380 |
+
|
381 |
+
parser.add_argument('model', type=str, help='OPT model to load; pass `facebook/opt-X`.')
|
382 |
+
parser.add_argument('dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], help='Where to extract calibration data from.')
|
383 |
+
parser.add_argument('--seed', type=int, default=0, help='Seed for sampling the calibration data.')
|
384 |
+
parser.add_argument('--nsamples', type=int, default=128, help='Number of calibration data samples.')
|
385 |
+
parser.add_argument('--percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.')
|
386 |
+
parser.add_argument('--nearest', action='store_true', help='Whether to run the RTN baseline.')
|
387 |
+
parser.add_argument('--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], help='#bits to use for quantization; use 16 for evaluating base model.')
|
388 |
+
parser.add_argument('--trits', action='store_true', help='Whether to use trits for quantization.')
|
389 |
+
parser.add_argument('--groupsize', type=int, default=-1, help='Groupsize to use for quantization; default uses full row.')
|
390 |
+
parser.add_argument('--eval', action='store_true', help='evaluate quantized model.')
|
391 |
+
parser.add_argument('--save', type=str, default='', help='Save quantized checkpoint under this name.')
|
392 |
+
parser.add_argument('--save_safetensors', type=str, default='', help='Save quantized `.safetensors` checkpoint under this name.')
|
393 |
+
parser.add_argument('--load', type=str, default='', help='Load quantized model.')
|
394 |
+
parser.add_argument('--benchmark', type=int, default=0, help='Number of tokens to use for benchmarking.')
|
395 |
+
parser.add_argument('--check', action='store_true', help='Whether to compute perplexity during benchmarking for verification.')
|
396 |
+
parser.add_argument('--sym', action='store_true', help='Whether to perform symmetric quantization.')
|
397 |
+
parser.add_argument('--act-order', action='store_true', help='Whether to apply the activation order GPTQ heuristic')
|
398 |
+
parser.add_argument('--new-eval', action='store_true', help='Whether to use the new PTB and C4 eval')
|
399 |
+
|
400 |
+
args = parser.parse_args()
|
401 |
+
|
402 |
+
if type(args.load) is not str:
|
403 |
+
args.load = args.load.as_posix()
|
404 |
+
|
405 |
+
if args.load:
|
406 |
+
model = load_quant(args.model, args.load, args.wbits, args.groupsize)
|
407 |
+
else:
|
408 |
+
model = get_opt(args.model)
|
409 |
+
model.eval()
|
410 |
+
|
411 |
+
dataloader, testloader = get_loaders(args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen)
|
412 |
+
|
413 |
+
if not args.load and args.wbits < 16 and not args.nearest:
|
414 |
+
tick = time.time()
|
415 |
+
quantizers = opt_sequential(model, dataloader, DEV)
|
416 |
+
print(time.time() - tick)
|
417 |
+
|
418 |
+
if args.benchmark:
|
419 |
+
gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
|
420 |
+
if len(gpus) > 1:
|
421 |
+
opt_multigpu(model, gpus)
|
422 |
+
else:
|
423 |
+
model = model.to(DEV)
|
424 |
+
if args.benchmark:
|
425 |
+
input_ids = next(iter(dataloader))[0][:, :args.benchmark]
|
426 |
+
benchmark(model, input_ids, check=args.check)
|
427 |
+
|
428 |
+
if args.eval:
|
429 |
+
datasets = ['wikitext2', 'ptb', 'c4']
|
430 |
+
if args.new_eval:
|
431 |
+
datasets = ['wikitext2', 'ptb-new', 'c4-new']
|
432 |
+
for dataset in datasets:
|
433 |
+
dataloader, testloader = get_loaders(dataset, seed=args.seed, model=args.model, seqlen=model.seqlen)
|
434 |
+
print(dataset)
|
435 |
+
opt_eval(model, testloader, DEV)
|
436 |
+
|
437 |
+
if args.save:
|
438 |
+
opt_pack(model, quantizers, args.wbits, args.groupsize)
|
439 |
+
torch.save(model.state_dict(), args.save)
|
440 |
+
|
441 |
+
if args.save_safetensors:
|
442 |
+
opt_pack(model, quantizers, args.wbits, args.groupsize)
|
443 |
+
from safetensors.torch import save_file as safe_save
|
444 |
+
state_dict = model.state_dict()
|
445 |
+
state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()}
|
446 |
+
safe_save(state_dict, args.save_safetensors)
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
safetensors==0.3.0
datasets==2.10.1
sentencepiece
git+https://github.com/huggingface/transformers
accelerate==0.17.1
triton==2.0.0
texttable
toml
numpy
protobuf==3.20.2