Imran1
/

Qwen2.5-72B-Instruct-FP8

Model card Files Files and versions Community

Imran1 committed on Oct 10, 2024

Commit

3c5ff26

·

verified ·

1 Parent(s): a3fcb16

Create serve.py

Files changed (1) hide show

code/serve.py +22 -0

code/serve.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import os
+import subprocess
+def run_vllm_inference():
+    """Run ``vllm serve`` for Imran1/Qwen2.5-72B-Instruct-FP8 and wait for it.
+
+    Sets ``HF_HUB_ENABLE_HF_TRANSFER=1`` in this process's environment (the
+    child process inherits it), builds the ``vllm serve`` command line and
+    blocks until the server process exits.
+
+    The API key is taken from the ``VLLM_API_KEY`` environment variable when
+    it is set and non-empty, so a real secret does not have to be committed
+    to the repository; otherwise the placeholder ``token-abc123`` is used.
+
+    Returns:
+        int: Exit status of the ``vllm`` process (0 on a clean shutdown).
+
+    Raises:
+        FileNotFoundError: If the ``vllm`` executable is not on ``PATH``.
+    """
+    # Set the necessary environment variables
+    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+    # Prefer a key supplied by the environment over the in-source placeholder.
+    api_key = os.environ.get("VLLM_API_KEY") or "token-abc123"
+    # vLLM serve command
+    command = [
+        "vllm", "serve", "Imran1/Qwen2.5-72B-Instruct-FP8",
+        "--tensor-parallel-size", "4",
+        "--dtype", "auto",
+        "--api-key", api_key,
+        "--max-model-len", "2000",
+        "--kv-cache-dtype", "auto",
+    ]
+    # Run the command as a subprocess (argument list, no shell) and hand the
+    # exit status back so a failed server start is not silently ignored.
+    return subprocess.run(command).returncode
+if __name__ == "__main__":
+    # Exit with the server's status so supervisors and shells can see failures.
+    raise SystemExit(run_vllm_inference())