"""Gradio demo comparing BitNet.cpp inference speed with Hugging Face Transformers.

The app shells out to the BitNet repository's ``setup_env.py`` and
``run_inference.py`` scripts and, for comparison, runs the same prompt through
a Transformers causal language model.
"""

import logging
import os
import subprocess
import time

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Path to the cloned repository
BITNET_REPO_PATH = "/home/user/app/BitNet"
SETUP_SCRIPT = os.path.join(BITNET_REPO_PATH, "setup_env.py")
INFERENCE_SCRIPT = os.path.join(BITNET_REPO_PATH, "run_inference.py")

# Quantization type passed to setup_env.py; it also names the GGUF file.
QUANT_TYPE = "i2_s"

# Model used when no model name is supplied to run_inference.
DEFAULT_BITNET_MODEL = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"

# Header text shown at the top of the page (kept flush-left so the string
# content is exactly what the Markdown component receives).
HEADER_MARKDOWN = """

BitNet.cpp Speed Demonstration

Compare the speed and performance of BitNet with Transformers!

"""


def _gguf_model_path(model_name):
    """Return the GGUF model path (relative to the BitNet repo) for a HF repo id.

    For the default model this yields
    ``models/Llama3-8B-1.58-100B-tokens/ggml-model-i2_s.gguf``.

    NOTE(review): assumes setup_env.py stores each model under
    ``models/<last component of the repo id>/`` -- confirm against the
    BitNet checkout in use.
    """
    repo_id = model_name or DEFAULT_BITNET_MODEL
    repo_basename = repo_id.rstrip("/").split("/")[-1]
    return os.path.join("models", repo_basename, f"ggml-model-{QUANT_TYPE}.gguf")


def setup_bitnet(model_name):
    """Run the BitNet ``setup_env.py`` script for the selected model.

    Args:
        model_name: Hugging Face repo id passed to ``--hf-repo``.

    Returns:
        A status string: a success message, the script's stderr prefixed with
        ``"Error in setup: "``, or the text of any exception raised while
        launching the process.
    """
    if not model_name:
        return "Error in setup: no model selected."

    # Argument list + no shell: the model name is never parsed by a shell.
    command = ["python", SETUP_SCRIPT, "--hf-repo", str(model_name), "-q", QUANT_TYPE]
    try:
        result = subprocess.run(
            command,
            cwd=BITNET_REPO_PATH,
            capture_output=True,
            text=True,
        )
    except Exception as e:  # UI boundary: report instead of crashing the app
        logger.exception("BitNet setup could not be started")
        return str(e)

    if result.returncode == 0:
        return "Setup completed successfully!"
    return f"Error in setup: {result.stderr}"


def run_inference(model_name, input_text, num_tokens=6):
    """Run BitNet inference through ``run_inference.py`` and time it.

    Args:
        model_name: Hugging Face repo id of the model prepared by
            ``setup_bitnet``; selects which GGUF file is loaded.
        input_text: Prompt passed to the script via ``-p``.
        num_tokens: Number of tokens to generate (``-n``).

    Returns:
        A ``(output, timing)`` tuple. On success ``output`` is the script's
        stdout and ``timing`` a human-readable duration; on failure
        ``output`` is an error message and ``timing`` is ``None``.
    """
    # The prompt is user-controlled free text, so it is passed as a single
    # argv element rather than being interpolated into a shell command line.
    command = [
        "python",
        INFERENCE_SCRIPT,
        "-m",
        _gguf_model_path(model_name),
        "-p",
        "" if input_text is None else str(input_text),
        "-n",
        str(int(num_tokens)),
        "-temp",
        "0",
    ]
    try:
        start_time = time.time()
        result = subprocess.run(
            command,
            cwd=BITNET_REPO_PATH,
            capture_output=True,
            text=True,
        )
        end_time = time.time()
    except Exception as e:  # UI boundary: report instead of crashing the app
        logger.exception("BitNet inference could not be started")
        return str(e), None

    if result.returncode == 0:
        inference_time = round(end_time - start_time, 2)
        return result.stdout, f"Inference took {inference_time} seconds."
    return f"Error during inference: {result.stderr}", None


def run_transformers(
    profile: gr.OAuthProfile | None,
    oauth_token: gr.OAuthToken | None,
    model_name,
    input_text,
    num_tokens,
):
    """Generate text with a Transformers model and time the generation.

    ``profile`` and ``oauth_token`` are not listed in the click handler's
    inputs; Gradio supplies them based on their type annotations.

    Args:
        profile: OAuth profile of the logged-in user, or ``None``. Unused.
        oauth_token: OAuth token of the logged-in user, or ``None``.
        model_name: Hugging Face repo id of the model to load.
        input_text: Prompt to continue.
        num_tokens: Number of new tokens to generate.

    Returns:
        A ``(text, timing)`` tuple matching the two output components. On
        failure ``text`` is an error message and ``timing`` is ``None``.
    """
    if oauth_token is None:
        # Two values are required because the handler feeds two outputs.
        return (
            "Error : To Compare please login to your HF account and make sure you have access to the used Llama models",
            None,
        )

    try:
        # The model and tokenizer are loaded on every call.
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=oauth_token.token)
        model = AutoModelForCausalLM.from_pretrained(model_name, token=oauth_token.token)

        # Encode the input text
        input_ids = tokenizer.encode(input_text, return_tensors="pt")

        # Only generation is timed; loading is excluded.
        start_time = time.time()
        output = model.generate(
            input_ids,
            max_new_tokens=int(num_tokens),
            num_return_sequences=1,
        )
        inference_time = time.time() - start_time

        # Decode the generated output
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    except Exception as e:  # UI boundary: mirror the sibling handlers
        logger.exception("Transformers inference failed for %s", model_name)
        return f"Error during Transformers inference: {e}", None

    return generated_text, f"{inference_time:.2f} seconds"


# Gradio Interface
def interface():
    """Build the Gradio Blocks UI and wire up its event handlers.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    # NOTE(review): the original indentation was lost, so which widgets sit
    # inside each gr.Row() is reconstructed (inputs and button in the row,
    # outputs below it) -- confirm against the intended layout.
    with gr.Blocks(css=".gr-button {background-color: #5C6BC0; color: white;} .gr-button:hover {background-color: #3F51B5;}") as demo:
        gr.LoginButton(elem_id="login-button", elem_classes="center-button")
        gr.Markdown(HEADER_MARKDOWN, elem_id="header")

        # Model selection and setup row
        with gr.Row():
            model_dropdown = gr.Dropdown(
                label="Select Model",
                choices=[
                    "HF1BitLLM/Llama3-8B-1.58-100B-tokens",
                    "1bitLLM/bitnet_b1_58-3B",
                    "1bitLLM/bitnet_b1_58-large",
                ],  # Replace with available models
                value="HF1BitLLM/Llama3-8B-1.58-100B-tokens",
                interactive=True,
                elem_id="model-dropdown",
            )
            setup_button = gr.Button("Run Setup", elem_id="setup-button")
        setup_status = gr.Textbox(
            label="Setup Status",
            interactive=False,
            placeholder="Setup status will appear here...",
        )

        # Inference row
        with gr.Row():
            num_tokens = gr.Slider(
                minimum=1,
                maximum=100,
                label="Number of Tokens to Generate",
                value=50,
                step=1,
            )
            input_text = gr.Textbox(
                label="Input Text",
                placeholder="Enter your input text here...",
            )
            infer_button = gr.Button("Run Inference", elem_id="infer-button")
        result_output = gr.Textbox(
            label="Output",
            interactive=False,
            placeholder="Inference output will appear here...",
        )
        time_output = gr.Textbox(
            label="Inference Time",
            interactive=False,
            placeholder="Inference time will appear here...",
        )

        # Comparison with Transformers
        with gr.Row():
            transformer_model_dropdown = gr.Dropdown(
                label="Select Transformers Model",
                choices=[
                    "meta-llama/Llama-3.1-8B",
                    "meta-llama/Llama-3.2-3B",
                    "meta-llama/Llama-3.2-1B",
                ],  # Replace with actual models
                value="meta-llama/Llama-3.1-8B",
                interactive=True,
            )
            compare_button = gr.Button(
                "Run Transformers Inference",
                elem_id="compare-button",
            )
        transformer_result_output = gr.Textbox(
            label="Transformers Output",
            interactive=False,
            placeholder="Transformers output will appear here...",
        )
        transformer_time_output = gr.Textbox(
            label="Transformers Inference Time",
            interactive=False,
            placeholder="Transformers inference time will appear here...",
        )

        # Actions
        setup_button.click(setup_bitnet, inputs=model_dropdown, outputs=setup_status)
        infer_button.click(
            run_inference,
            inputs=[model_dropdown, input_text, num_tokens],
            outputs=[result_output, time_output],
        )
        compare_button.click(
            run_transformers,
            inputs=[transformer_model_dropdown, input_text, num_tokens],
            outputs=[transformer_result_output, transformer_time_output],
        )

    return demo


# Launch the Gradio app
demo = interface()
demo.launch()