---
library_name: ctranslate2
license: mit
base_model:
- microsoft/phi-4
base_model_relation: quantized
tags:
- ctranslate2
- AWQ
- phi-4
- phi
- chat
---

### CTranslate2-based version of Phi-4

1) First quantized to the AWQ format, using the [cosmopedia-100k dataset](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k) for calibration.
2) Then converted to the CTranslate2 format (a rough sketch of both steps is shown below).

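For reference, a rough sketch of what this two-step conversion can look like. This is not the exact script used for this repo: it assumes the AutoAWQ package for step 1, a CTranslate2 release with AWQ support for step 2, and the output directories `phi-4-awq` and `phi-4-ct2-model` are placeholders.

```python
# Hypothetical reproduction sketch -- exact arguments may differ between
# AutoAWQ / CTranslate2 versions.
import ctranslate2
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# 1) Quantize the original weights to AWQ, calibrating on cosmopedia-100k.
model = AutoAWQForCausalLM.from_pretrained("microsoft/phi-4")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-4")
model.quantize(
    tokenizer,
    quant_config={"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"},
    calib_data="HuggingFaceTB/cosmopedia-100k",  # calibration dataset
)
model.save_quantized("phi-4-awq")
tokenizer.save_pretrained("phi-4-awq")

# 2) Convert the AWQ checkpoint to the CTranslate2 format.
converter = ctranslate2.converters.TransformersConverter(
    "phi-4-awq",
    copy_files=["tokenizer.json", "tokenizer_config.json"],
)
converter.convert("phi-4-ct2-model")
```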
### For inference, the main difference is that you do not pass the `compute_type` parameter.
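For example, a minimal sketch of the difference (the model path is a placeholder):

```python
import ctranslate2

# With a model converted the usual way you would normally pick a compute type, e.g.:
# generator = ctranslate2.Generator("some-ct2-model", device="cuda", compute_type="int8_float16")

# With this AWQ-quantized conversion, simply omit compute_type:
generator = ctranslate2.Generator("path/to/your/phi-4-ct2-model", device="cuda")
```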
# Example Usage

<details><summary>Non-Streaming Example:</summary>

```python
import ctranslate2
from transformers import AutoTokenizer


def generate_response(prompt: str, system_message: str, model_path: str) -> str:
    # Load the converted model on the GPU (no compute_type needed for this model).
    generator = ctranslate2.Generator(model_path, device="cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Build the Phi-4 chat prompt.
    formatted_prompt = f"""<|im_start|>system<|im_sep|>{system_message}<|im_end|>
<|im_start|>user<|im_sep|>{prompt}<|im_end|>
<|im_start|>assistant<|im_sep|>"""

    # Tokenize the prompt and generate the full completion in one call.
    tokens = tokenizer.tokenize(formatted_prompt)
    results = generator.generate_batch(
        [tokens],
        max_length=1024,
        sampling_temperature=0.7,
        include_prompt_in_result=False,
    )

    # Decode the generated token ids back into text.
    response = tokenizer.decode(results[0].sequences_ids[0], skip_special_tokens=True)
    return response


if __name__ == "__main__":
    model_path = "path/to/your/phi-4-ct2-model"
    system_message = "You are a helpful AI assistant."
    user_prompt = "Write a short poem about a cat."

    response = generate_response(user_prompt, system_message, model_path)
    print("\nGenerated response:")
    print(response)
```

</details>

<details><summary>Streaming Example:</summary>

```python
from typing import List

import ctranslate2
from transformers import AutoTokenizer


def generate_response(prompt: str, system_message: str, model_path: str) -> None:
    # Load the converted model on the GPU (no compute_type needed for this model).
    generator = ctranslate2.Generator(model_path, device="cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Build the Phi-4 chat prompt.
    formatted_prompt = f"""<|im_start|>system<|im_sep|>{system_message}<|im_end|>
<|im_start|>user<|im_sep|>{prompt}<|im_end|>
<|im_start|>assistant<|im_sep|>"""

    tokens: List[str] = tokenizer.tokenize(formatted_prompt)

    # Stream tokens as they are generated, stopping at the first EOS/special token.
    for step in generator.generate_tokens([tokens], max_length=1024, sampling_temperature=0.7):
        token: str = step.token
        if token == tokenizer.eos_token or token in tokenizer.all_special_tokens:
            break
        decoded_token: str = tokenizer.decode([step.token_id])
        print(decoded_token, end="", flush=True)


if __name__ == "__main__":
    model_path = "path/to/your/phi-4-ct2-model"
    system_message = "You are a helpful AI assistant."
    user_prompt = "Write a short poem about a cat."

    print("\nGenerating response:")
    generate_response(user_prompt, system_message, model_path)
```

</details>