"""Gradio chat demo for the HaitameLaf/Phi3-Game16bit causal language model.

On import this script installs missing dependencies with pip, loads the model
and tokenizer from the Hugging Face Hub (the ``HF_TOKEN`` environment variable
must be set), moves the model to the GPU when one is available, and builds a
``gr.ChatInterface`` that streams generated text.
"""
import os
import subprocess
import sys
from threading import Thread


def install_package(package_name):
    """Install ``package_name`` with pip into the running interpreter.

    Args:
        package_name: Requirement string handed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    subprocess.run(
        [sys.executable, "-m", "pip", "install", package_name],
        check=True,
    )


# Make sure torch is importable, installing it on first run if necessary.
try:
    import torch
except ImportError:
    print("Torch n'est pas installé. Installation de torch...")
    install_package("torch")
    import torch

# Make sure transformers is importable, installing it on first run if necessary.
try:
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        TextIteratorStreamer,
    )
except ImportError:
    print("Transformers n'est pas installé. Installation de transformers...")
    install_package("transformers")
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        TextIteratorStreamer,
    )

# Install flash-attn (best effort: a failure here is deliberately not fatal).
# The current environment is merged in rather than replaced, so PATH, HOME and
# any virtualenv settings stay visible to pip, and pip is invoked through the
# running interpreter so the package lands in the right site-packages.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,
)

import gradio as gr

# Hugging Face authentication token, required to download the model.
token = os.getenv("HF_TOKEN")
if not token:
    raise ValueError("Le token d'authentification HF_TOKEN n'est pas défini.")

# Load the model and its tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    "HaitameLaf/Phi3-Game16bit",
    token=token,
    trust_remote_code=True,
)
tok = AutoTokenizer.from_pretrained("HaitameLaf/Phi3-Game16bit", token=token)
terminators = [tok.eos_token_id]

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

model = model.to(device)


def _build_conversation(message, history):
    """Return the chat-template message list for ``history`` plus ``message``.

    Each history item is indexed as ``item[0]`` (user text) and ``item[1]``
    (assistant text); turns are emitted in chronological, interleaved order.
    An empty/None assistant reply is skipped.
    """
    conversation = []
    for item in history:
        conversation.append({"role": "user", "content": item[0]})
        if item[1]:
            conversation.append({"role": "assistant", "content": item[1]})
    conversation.append({"role": "user", "content": message})
    return conversation


def chat(message, history, temperature, do_sample, max_tokens):
    """Stream the model's reply to ``message`` given the prior ``history``.

    Args:
        message: The new user message.
        history: Previous exchanges; each item is indexed as
            ``item[0]`` = user text, ``item[1]`` = assistant text
            (presumably Gradio's pair-style history; confirm against the
            installed Gradio version).
        temperature: Sampling temperature from the UI slider (0 to 1).
        do_sample: Whether sampling was requested in the UI.
        max_tokens: Maximum number of new tokens to generate.

    Yields:
        The reply text accumulated so far, growing as tokens are streamed.
    """
    conversation = _build_conversation(message, history)
    prompt = tok.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tok([prompt], return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(
        tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
    )

    # The UI slider allows temperature 0, which generate() rejects when
    # sampling; treat that case as a request for greedy decoding instead.
    sampling = bool(do_sample) and temperature > 0
    generate_kwargs = {
        "input_ids": model_inputs.input_ids,
        "attention_mask": model_inputs.attention_mask,
        "streamer": streamer,
        "max_new_tokens": int(max_tokens),
        "do_sample": sampling,
        "eos_token_id": terminators,
    }
    if sampling:
        # Temperature is only meaningful (and only accepted silently) when sampling.
        generate_kwargs["temperature"] = temperature

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    worker.join()
    # Always emit a final value, even when the model produced no text at all.
    yield partial_text


# Gradio UI configuration.
demo = gr.ChatInterface(
    fn=chat,
    examples=[["Write me a poem about Machine Learning."]],
    additional_inputs_accordion=gr.Accordion(
        label="⚙️ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Slider(minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature"),
        gr.Checkbox(label="Sampling", value=True),
        gr.Slider(minimum=128, maximum=4096, step=1, value=512, label="Max new tokens"),
    ],
    stop_btn="Stop Generation",
    title="Chat With LLMs",
    description="Now Running [HaitameLaf/Phi3-Game16bit](https://huggingface.co/HaitameLaf/Phi3-Game16bit)",
)

if __name__ == "__main__":
    demo.launch()