Spaces:
Runtime error
Runtime error
File size: 6,116 Bytes
fa02e71 4d88461 fa02e71 8716816 4d88461 fa02e71 8716816 4d88461 8716816 4d88461 8716816 fa02e71 8716816 fa02e71 8716816 fa02e71 4d88461 8716816 4d88461 8520bf2 4d88461 8716816 4d88461 8716816 fa02e71 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 |
from transformers import AutoModel, AutoTokenizer, LlamaTokenizer, LlamaForCausalLM
import gradio as gr
import torch
import os
import io
import sys
import platform
import intel_extension_for_pytorch as ipex
import intel_extension_for_pytorch._C as ipex_core
from cpuinfo import get_cpu_info
from contextlib import redirect_stdout
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Paths whose inodes in_chroot() compares.
ROOT = '/'
SELF_ROOT = '/proc/self/root'

# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = LlamaTokenizer.from_pretrained(
    "lmsys/vicuna-7b-v1.3",
    trust_remote_code=True,
)

# Load the weights, move them to DEVICE and switch to evaluation mode.
# Both .to() and .eval() return the module itself, so the calls chain.
model = (
    LlamaForCausalLM.from_pretrained(
        "lmsys/vicuna-7b-v1.3",
        trust_remote_code=True,
    )
    .to(DEVICE)
    .eval()
)
def in_chroot():
    '''
    Report whether the process appears to be running inside a chroot.

    Compares the inode number of ROOT with the inode number of SELF_ROOT.
    Exits the program with an error message if either path is missing.

    Returns True when the inode of ROOT is not one of the usual root
    inode numbers, or when the two inode numbers differ; False otherwise.
    '''
    try:
        root_inode = os.stat(ROOT).st_ino
        self_inode = os.stat(SELF_ROOT).st_ino
    except FileNotFoundError as e:
        sys.exit(f"ERROR: Failed to stat: {e}")

    # Most filesystems use inode 2 for their root directory; XFS uses 128.
    # Any other value is treated as "in a chroot" without further checks.
    usual_root_inodes = (2, 128)
    if root_inode in usual_root_inodes:
        # NOTE(review): SELF_ROOT is resolved for the current process, so
        # presumably both paths name the same directory and this is always
        # False -- confirm this is the intended comparison.
        return root_inode != self_inode
    return True
def get_features():
    '''
    Build the feature-availability table for the current machine.

    Returns a dictionary where:
    key: feature name.
    value: Boolean showing if the feature is available.
    '''
    flags = get_cpu_info()["flags"]

    # Display name -> CPU flag whose presence signals the feature.
    # The order here is the order of the keys in the returned dictionary.
    flag_for_feature = (
        ('VM', 'hypervisor'),
        ('TDX TD', 'tdx_guest'),
        ('AMX available', 'amx_tile'),
        ('AMX-BF16 available', 'amx_bf16'),
        ('AMX-INT8 available', 'amx_int8'),
        ('AVX-VNNI available', 'avx_vnni'),
        ('AVX512-VNNI available', 'avx512_vnni'),
        ('AVX512-FP16 available', 'avx512_fp16'),
        ('AVX512-BF16 available', 'avx512_bf16'),
    )
    features = {name: flag in flags for name, flag in flag_for_feature}

    # The last two entries come from the ISA levels reported by IPEX
    # rather than from the CPU flags.
    features['AMX IPEX available'] = (
        ipex_core._get_highest_cpu_support_isa_level() == 'AMX'
    )
    features['AMX IPEX enabled'] = ipex_core._get_current_isa_level() == 'AMX'
    return features
def get_debug_details():
    '''
    Build the collapsible block of debug information.

    Collects CPU, kernel, Python, PyTorch, IPEX and oneDNN details and
    returns them as markdown text: a table wrapped in a `<details>`
    element.
    '''
    # ipex.version() prints its report instead of returning it, so
    # capture stdout while it runs.
    captured = io.StringIO()
    with redirect_stdout(captured):
        ipex.version()
    # Flatten the multi-line report onto one line.
    ipex_version_details = ", ".join(captured.getvalue().split("\n"))

    ipex_current_isa_level = ipex_core._get_current_isa_level()
    ipex_max_isa_level = ipex_core._get_highest_cpu_support_isa_level()

    # Either variable is shown as `None` when it is not set.
    ipex_env_var = os.environ.get('ATEN_CPU_CAPABILITY')
    onednn_env_var = os.environ.get('ONEDNN_MAX_CPU_ISA')

    with open('/proc/version', 'r') as version_file:
        kernel_version = version_file.read().rstrip()

    in_chroot_result = in_chroot()

    cpu_info = get_cpu_info()
    flags = cpu_info["flags"]

    # A gradio.Accordion() would also work here, but the markdown
    # `<details>` element is more visually compact.
    return f"""
<details>
<summary>Click to show debug details</summary>
| Feature | Value |
|-|-|
| Arch | `{cpu_info['arch']}` |
| CPU | `{cpu_info['brand_raw']}` |
| CPU flags | `{flags}` |
| Kernel | `{kernel_version}` |
| Python version | `{sys.version}` (implementation: `{platform.python_implementation()}`) |
| Python version details | `{sys.version_info}` |
| PyTorch version | `{torch.__version__}` |
| IPEX version | `{ipex.ipex_version}` |
| IPEX CPU detected | `{ipex_core._has_cpu()}` |
| IPEX XPU detected | `{ipex_core._has_xpu()}` |
| IPEX version details | `{ipex_version_details}` |
| IPEX env var `ATEN_CPU_CAPABILITY` | `{ipex_env_var}` |
| IPEX current ISA level | `{ipex_current_isa_level}` |
| IPEX max ISA level | `{ipex_max_isa_level}` |
| oneDNN env var `ONEDNN_MAX_CPU_ISA` | `{onednn_env_var}` |
| in chroot | `{in_chroot_result}` |
</details>
"""
def predict(input, history=None):
    '''
    Generate the model's reply to `input` and rebuild the chat transcript.

    Parameters:
    input: text entered by the user for this turn.
    history: token ids of the conversation so far, in the nested-list
        form returned by a previous call. None or an empty list starts
        a new conversation.

    Returns a tuple `(response, history)`:
    response: list of `(user_text, bot_text)` tuples, one per completed
        exchange, in conversation order.
    history: token ids of the whole conversation including the new
        reply, as a nested list.
    '''
    if history is None:
        history = []

    # Terminate the user's turn with EOS so that turns can be separated
    # again after decoding.
    new_user_input_ids = tokenizer.encode(
        input + tokenizer.eos_token, return_tensors='pt'
    )

    # Append the new turn to the running conversation. An empty history
    # becomes an empty 1-D tensor, which torch.cat accepts alongside the
    # 2-D tensor of new ids.
    bot_input_ids = torch.cat(
        [torch.LongTensor(history), new_user_input_ids], dim=-1
    )
    # The model was moved to DEVICE at load time; the ids are created on
    # the CPU, so move them to wherever the model lives before generating.
    bot_input_ids = bot_input_ids.to(model.device)

    history = model.generate(
        bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id
    ).tolist()

    # Split on the tokenizer's own EOS marker (the same one appended to
    # the user's turn above) rather than a hard-coded marker from another
    # model family, which would never occur in the decoded text.
    # NOTE(review): the decoded text may still contain the tokenizer's
    # BOS marker -- confirm whether it should be stripped for display.
    turns = tokenizer.decode(history[0]).split(tokenizer.eos_token)

    # Pair the turns up as (user, bot); a trailing unpaired piece is dropped.
    response = list(zip(turns[::2], turns[1::2]))
    return response, history
with gr.Blocks() as demo:
    # Page heading.
    gr.Markdown("## Confidential HuggingFace Runner\n")

    # Conversation state handed to, and returned from, predict() on
    # every turn.
    state = gr.State([])
    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=400)

    # Input row: a wide textbox next to a narrow "Generate" button.
    with gr.Row():
        with gr.Column(scale=4):
            txt = gr.Textbox(
                show_label=False,
                placeholder="Enter text and press enter",
            ).style(container=False)
        with gr.Column(scale=1):
            button = gr.Button("Generate")

    # Pressing enter in the textbox and clicking the button both run
    # predict() with the same inputs and outputs.
    txt.submit(predict, [txt, state], [chatbot, state])
    button.click(predict, [txt, state], [chatbot, state])

    # Read-only checkbox group showing which features were detected.
    with gr.Row():
        features_dict = get_features()
        all_features = features_dict.keys()
        # Names of the features that are actually set/available.
        set_features = [
            name for name, detected in features_dict.items() if detected
        ]
        gr.CheckboxGroup(
            all_features,
            label="Features",
            # Make the boxes read-only
            interactive=False,
            # Specify which features were detected
            value=set_features,
            info="Features detected from environment",
        )

    # Collapsible debug table.
    with gr.Row():
        debug_details = get_debug_details()
        gr.Markdown(debug_details)

# queue() returns the Blocks object itself, so calling launch() on demo
# afterwards is the same as chaining the two calls.
demo.queue()
demo.launch(share=True, server_name="0.0.0.0")
|