diff --git a/modules/AutoGPTQ_loader.py b/modules/AutoGPTQ_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..514a6ee5145fbc67f57df69be14955bfa0c0b7ea
--- /dev/null
+++ b/modules/AutoGPTQ_loader.py
@@ -0,0 +1,74 @@
+from pathlib import Path
+
+from accelerate.utils import is_xpu_available
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+
+import modules.shared as shared
+from modules.logging_colors import logger
+from modules.models import get_max_memory_dict
+
+
+def load_quantized(model_name):
+    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
+    pt_path = None
+
+    # Find the model checkpoint
+    if shared.args.checkpoint:
+        pt_path = Path(shared.args.checkpoint)
+    else:
+        for ext in ['.safetensors', '.pt', '.bin']:
+            found = list(path_to_model.glob(f"*{ext}"))
+            if len(found) > 0:
+                if len(found) > 1:
+                    logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.')
+
+                pt_path = found[-1]
+                break
+
+    if pt_path is None:
+        logger.error("The model could not be loaded because its checkpoint file in .bin/.pt/.safetensors format could not be located.")
+        return
+
+    use_safetensors = pt_path.suffix == '.safetensors'
+    if not (path_to_model / "quantize_config.json").exists():
+        quantize_config = BaseQuantizeConfig(
+            bits=bits if (bits := shared.args.wbits) > 0 else 4,
+            group_size=gs if (gs := shared.args.groupsize) > 0 else -1,
+            desc_act=shared.args.desc_act
+        )
+    else:
+        quantize_config = None
+
+    # Define the params for AutoGPTQForCausalLM.from_quantized
+    params = {
+        'model_basename': pt_path.stem,
+        'device': "xpu:0" if is_xpu_available() else "cuda:0" if not shared.args.cpu else "cpu",
+        'use_triton': shared.args.triton,
+        'inject_fused_attention': not shared.args.no_inject_fused_attention,
+        'inject_fused_mlp': not shared.args.no_inject_fused_mlp,
+        'use_safetensors': use_safetensors,
+        'trust_remote_code': shared.args.trust_remote_code,
+        'max_memory': get_max_memory_dict(),
+        'quantize_config': quantize_config,
+        'use_cuda_fp16': not shared.args.no_use_cuda_fp16,
+        'disable_exllama': shared.args.disable_exllama,
+        'disable_exllamav2': shared.args.disable_exllamav2,
+    }
+
+    logger.info(f"The AutoGPTQ params are: {params}")
+    model = AutoGPTQForCausalLM.from_quantized(path_to_model, **params)
+
+    # These lines fix the multimodal extension when used with AutoGPTQ
+    if hasattr(model, 'model'):
+        if not hasattr(model, 'dtype'):
+            if hasattr(model.model, 'dtype'):
+                model.dtype = model.model.dtype
+
+        if hasattr(model.model, 'model') and hasattr(model.model.model, 'embed_tokens'):
+            if not hasattr(model, 'embed_tokens'):
+                model.embed_tokens = model.model.model.embed_tokens
+
+            if not hasattr(model.model, 'embed_tokens'):
+                model.model.embed_tokens = model.model.model.embed_tokens
+
+    return model
diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..601c58f30a754fc01812f301d85fd3fc890e3765
--- /dev/null
+++ b/modules/GPTQ_loader.py
@@ -0,0 +1,171 @@
+import inspect
+import re
+from pathlib import Path
+
+import accelerate
+import torch
+import transformers
+from accelerate.utils import is_xpu_available
+from gptq_for_llama import llama_inference_offload
+from gptq_for_llama.modelutils import find_layers
+from gptq_for_llama.quant import make_quant
+from transformers import AutoConfig, AutoModelForCausalLM
+
+import modules.shared as shared
+from modules.logging_colors import logger
+
+
+# This function is a replacement for the load_quant function in the
+# GPTQ-for_LLaMa repository. It supports more models and branches.
+def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=None, kernel_switch_threshold=128, eval=True):
+    exclude_layers = exclude_layers or ['lm_head']
+
+    def noop(*args, **kwargs):
+        pass
+
+    config = AutoConfig.from_pretrained(model, trust_remote_code=shared.args.trust_remote_code)
+    torch.nn.init.kaiming_uniform_ = noop
+    torch.nn.init.uniform_ = noop
+    torch.nn.init.normal_ = noop
+
+    torch.set_default_dtype(torch.half)
+    transformers.modeling_utils._init_weights = False
+    torch.set_default_dtype(torch.half)
+    model = AutoModelForCausalLM.from_config(config, trust_remote_code=shared.args.trust_remote_code)
+    torch.set_default_dtype(torch.float)
+    if eval:
+        model = model.eval()
+
+    layers = find_layers(model)
+    for name in exclude_layers:
+        if name in layers:
+            del layers[name]
+
+    gptq_args = inspect.getfullargspec(make_quant).args
+
+    make_quant_kwargs = {
+        'module': model,
+        'names': layers,
+        'bits': wbits,
+    }
+    if 'groupsize' in gptq_args:
+        make_quant_kwargs['groupsize'] = groupsize
+    if 'faster' in gptq_args:
+        make_quant_kwargs['faster'] = faster_kernel
+    if 'kernel_switch_threshold' in gptq_args:
+        make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold
+
+    make_quant(**make_quant_kwargs)
+
+    del layers
+    if checkpoint.endswith('.safetensors'):
+        from safetensors.torch import load_file as safe_load
+        model.load_state_dict(safe_load(checkpoint), strict=False)
+    else:
+        model.load_state_dict(torch.load(checkpoint, weights_only=True), strict=False)
+
+    model.seqlen = 2048
+    return model
+
+
+# Used to locate the .pt/.safetensors quantized file
+def find_quantized_model_file(model_name):
+    if shared.args.checkpoint:
+        return Path(shared.args.checkpoint)
+
+    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
+    pt_path = None
+    priority_name_list = [
+        Path(f'{shared.args.model_dir}/{model_name}{hyphen}{shared.args.wbits}bit{group}{ext}')
+        for group in ([f'-{shared.args.groupsize}g', ''] if shared.args.groupsize > 0 else [''])
+        for ext in ['.safetensors', '.pt']
+        for hyphen in ['-', f'/{model_name}-', '/']
+    ]
+
+    for path in priority_name_list:
+        if path.exists():
+            pt_path = path
+            break
+
+    # If the model hasn't been found with a well-behaved name, pick the last .pt
+    # or the last .safetensors found in its folder as a last resort
+    if not pt_path:
+        for ext in ['.pt', '.safetensors']:
+            found = list(path_to_model.glob(f"*{ext}"))
+            if len(found) > 0:
+                if len(found) > 1:
+                    logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.')
+
+                pt_path = found[-1]
+                break
+
+    return pt_path
+
+
+# The function that loads the model in modules/models.py
+def load_quantized(model_name):
+    if shared.args.model_type is None:
+        logger.error("The model could not be loaded because its type could not be inferred from its name.")
+        logger.error("Please specify the type manually using the --model_type argument.")
+        return None
+
+    # Select the appropriate load_quant function
+    model_type = shared.args.model_type.lower()
+    if shared.args.pre_layer and model_type == 'llama':
+        load_quant = llama_inference_offload.load_quant
+    elif model_type in ('llama', 'opt', 'gptj'):
+        if shared.args.pre_layer:
+            logger.warning("Ignoring --pre_layer because it only works for llama model type.")
+
+        load_quant = _load_quant
+    else:
+        logger.error("Unknown pre-quantized model type specified. Only 'llama', 'opt' and 'gptj' are supported")
+        exit()
+
+    # Find the quantized model weights file (.pt/.safetensors)
+    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
+    pt_path = find_quantized_model_file(model_name)
+    if not pt_path:
+        logger.error("Could not find the quantized model in .pt or .safetensors format. Exiting.")
+        exit()
+    else:
+        logger.info(f"Found the following quantized model: {pt_path}")
+
+    # qwopqwop200's offload
+    if model_type == 'llama' and shared.args.pre_layer:
+        if len(shared.args.pre_layer) == 1:
+            pre_layer = shared.args.pre_layer[0]
+        else:
+            pre_layer = shared.args.pre_layer
+
+        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, pre_layer)
+    else:
+        threshold = False if model_type == 'gptj' else 128
+        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold)
+
+        # accelerate offload (doesn't work properly)
+        if shared.args.gpu_memory or torch.cuda.device_count() > 1 or (is_xpu_available() and torch.xpu.device_count() > 1):
+            if shared.args.gpu_memory:
+                memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory))
+                max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
+                max_memory = {}
+                for i in range(len(memory_map)):
+                    max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
+
+                max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
+            else:
+                max_memory = accelerate.utils.get_balanced_memory(model)
+
+            device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
+            logger.info("Using the following device map for the quantized model:", device_map)
+            # https://huggingface.co/docs/accelerate/package_reference/big_modeling#accelerate.dispatch_model
+            model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True)
+
+        # No offload
+        elif not shared.args.cpu:
+            if is_xpu_available():
+                model = model.to(torch.device("xpu:0"))
+            else:
+                model = model.to(torch.device('cuda:0'))
+
+    return model
diff --git a/modules/LoRA.py b/modules/LoRA.py
new file mode 100644
index 0000000000000000000000000000000000000000..15132f4ee6527cfb4d86d48ccd7a286c4e0bec55
--- /dev/null
+++ b/modules/LoRA.py
@@ -0,0 +1,153 @@
+from pathlib import Path
+
+import torch
+from peft import PeftModel
+from transformers import is_torch_xpu_available
+
+import modules.shared as shared
+from modules.logging_colors import logger
+from modules.models import reload_model
+
+
+def add_lora_to_model(lora_names):
+    if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ':
+        add_lora_autogptq(lora_names)
+    elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader in ['ExLlamav2', 'ExLlamav2_HF']:
+        add_lora_exllamav2(lora_names)
+    else:
+        add_lora_transformers(lora_names)
+
+
+def get_lora_path(lora_name):
+    p = Path(lora_name)
+    if p.exists():
+        lora_name = p.parts[-1]
+
+    return Path(f"{shared.args.lora_dir}/{lora_name}")
+
+
+def add_lora_exllamav2(lora_names):
+
+    from exllamav2 import ExLlamaV2Lora
+
+    if isinstance(shared.model.loras, list):
+        for lora in shared.model.loras:
+            lora.unload()
+
+    if len(lora_names) > 0:
+        logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join(lora_names)))
+        shared.model.loras = []
+        for lora_name in lora_names:
+            lora_path = get_lora_path(lora_name)
+            if shared.model.__class__.__name__ == 'Exllamav2Model':
+                lora = ExLlamaV2Lora.from_directory(shared.model.model, str(lora_path))
+            else:
+                lora = ExLlamaV2Lora.from_directory(shared.model.ex_model, str(lora_path))
+
+            shared.model.loras.append(lora)
+
+        shared.lora_names = lora_names
+    else:
+        shared.lora_names = []
+        shared.model.loras = None
+
+
+def add_lora_autogptq(lora_names):
+    '''
+    Adapted from https://github.com/Ph0rk0z/text-generation-webui-testing
+    '''
+
+    try:
+        from auto_gptq import get_gptq_peft_model
+        from auto_gptq.utils.peft_utils import GPTQLoraConfig
+    except:
+        logger.error("This version of AutoGPTQ does not support LoRA. You need to install from source or wait for a new release.")
+        return
+
+    if len(lora_names) == 0:
+        reload_model()
+
+        shared.lora_names = []
+        return
+    else:
+        if len(lora_names) > 1:
+            logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
+        if not shared.args.no_inject_fused_attention:
+            logger.warning('Fused Atttention + AutoGPTQ may break Lora loading. Disable it.')
+
+        peft_config = GPTQLoraConfig(
+            inference_mode=True,
+        )
+
+        lora_path = get_lora_path(lora_names[0])
+        logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]])))
+        shared.model = get_gptq_peft_model(shared.model, peft_config, lora_path)
+        shared.lora_names = [lora_names[0]]
+        return
+
+
+def add_lora_transformers(lora_names):
+    prior_set = set(shared.lora_names)
+    added_set = set(lora_names) - prior_set
+    removed_set = prior_set - set(lora_names)
+
+    # If no LoRA needs to be added or removed, exit
+    if len(added_set) == 0 and len(removed_set) == 0:
+        return
+
+    # Add a LoRA when another LoRA is already present
+    if len(removed_set) == 0 and len(prior_set) > 0 and "__merged" not in shared.model.peft_config.keys():
+        logger.info(f"Adding the LoRA(s) named {added_set} to the model")
+        for lora in added_set:
+            shared.model.load_adapter(get_lora_path(lora), lora)
+
+        if len(lora_names) > 1:
+            merge_loras()
+
+        shared.lora_names = lora_names
+        return
+
+    # If any LoRA needs to be removed, start over
+    if len(removed_set) > 0:
+        shared.model = shared.model.unload()
+
+    if len(lora_names) > 0:
+        params = {}
+        if not shared.args.cpu:
+            if shared.args.load_in_4bit or shared.args.load_in_8bit:
+                params['peft_type'] = shared.model.dtype
+            else:
+                params['dtype'] = shared.model.dtype
+                if hasattr(shared.model, "hf_device_map"):
+                    params['device_map'] = {"base_model.model." + k: v for k, v in shared.model.hf_device_map.items()}
+
+        logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join(lora_names)))
+        shared.model = PeftModel.from_pretrained(shared.model, get_lora_path(lora_names[0]), adapter_name=lora_names[0], **params)
+        for lora in lora_names[1:]:
+            shared.model.load_adapter(get_lora_path(lora), lora)
+
+        if len(lora_names) > 1:
+            merge_loras()
+
+        if not shared.args.load_in_8bit and not shared.args.cpu:
+            shared.model.half()
+            if not hasattr(shared.model, "hf_device_map"):
+                if torch.backends.mps.is_available():
+                    device = torch.device('mps')
+                    shared.model = shared.model.to(device)
+                elif is_torch_xpu_available():
+                    device = torch.device("xpu:0")
+                    shared.model = shared.model.to(device)
+                else:
+                    shared.model = shared.model.cuda()
+
+    shared.lora_names = lora_names
+
+
+def merge_loras():
+    if len(list({shared.model.peft_config[adapter].r for adapter in shared.model.peft_config.keys()})) > 1:
+        logger.warning("The loaded LoRAs cannot be merged, as they have dissimilar ranks. Only the first one will be active.")
+        return
+
+    shared.model.add_weighted_adapter(shared.lora_names, [1] * len(shared.lora_names), "__merged")
+    shared.model.set_adapter("__merged")
diff --git a/modules/RoPE.py b/modules/RoPE.py
new file mode 100644
index 0000000000000000000000000000000000000000..31163a331eda79e3548024a65d9cd8c4a1752297
--- /dev/null
+++ b/modules/RoPE.py
@@ -0,0 +1,18 @@
+def get_alpha_value(alpha, base):
+    '''
+    Gets alpha_value from alpha_value and rope_freq_base
+    '''
+    if base > 0:
+        return (base / 10000.) ** (63 / 64.)
+    else:
+        return alpha
+
+
+def get_rope_freq_base(alpha, base):
+    '''
+    Gets rope_freq_base from alpha_value and rope_freq_base
+    '''
+    if base > 0:
+        return base
+    else:
+        return 10000 * alpha ** (64 / 63.)
diff --git a/modules/__pycache__/LoRA.cpython-311.pyc b/modules/__pycache__/LoRA.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fcc9a563032a72da4381d4fba7b134f094265e00
Binary files /dev/null and b/modules/__pycache__/LoRA.cpython-311.pyc differ
diff --git a/modules/__pycache__/RoPE.cpython-311.pyc b/modules/__pycache__/RoPE.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79290630f32459b0080702d89c0f6c4f566f98b5
Binary files /dev/null and b/modules/__pycache__/RoPE.cpython-311.pyc differ
diff --git a/modules/__pycache__/block_requests.cpython-311.pyc b/modules/__pycache__/block_requests.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..575fc168c7c267b6e717c9ba683d42c238b2de64
Binary files /dev/null and b/modules/__pycache__/block_requests.cpython-311.pyc differ
diff --git a/modules/__pycache__/callbacks.cpython-311.pyc b/modules/__pycache__/callbacks.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b843a37d64ef556c786331e85ebc4903adbe16d5
Binary files /dev/null and b/modules/__pycache__/callbacks.cpython-311.pyc differ
diff --git a/modules/__pycache__/chat.cpython-311.pyc b/modules/__pycache__/chat.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..baccdd38b33ef44b4777229361be7d1c321fcf38
Binary files /dev/null and b/modules/__pycache__/chat.cpython-311.pyc differ
diff --git a/modules/__pycache__/evaluate.cpython-311.pyc b/modules/__pycache__/evaluate.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd05d10e0ab3b247ce9e693b42aedbce87afa0b0
Binary files /dev/null and b/modules/__pycache__/evaluate.cpython-311.pyc differ
diff --git a/modules/__pycache__/extensions.cpython-311.pyc b/modules/__pycache__/extensions.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03a0c6ad8d521981044d23fc771f15666b0d23b4
Binary files /dev/null and b/modules/__pycache__/extensions.cpython-311.pyc differ
diff --git a/modules/__pycache__/github.cpython-311.pyc b/modules/__pycache__/github.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd9600198f7289c398a065d17fc5cbdcdcd1c252
Binary files /dev/null and b/modules/__pycache__/github.cpython-311.pyc differ
diff --git a/modules/__pycache__/html_generator.cpython-311.pyc b/modules/__pycache__/html_generator.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3a0f6efa90578f83dd20a5133b98ea616405874
Binary files /dev/null and b/modules/__pycache__/html_generator.cpython-311.pyc differ
diff --git a/modules/__pycache__/llamacpp_hf.cpython-311.pyc b/modules/__pycache__/llamacpp_hf.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..194c65830d3e08fa9af4c48f29165688d637e30c
Binary files /dev/null and b/modules/__pycache__/llamacpp_hf.cpython-311.pyc differ
diff --git a/modules/__pycache__/llamacpp_model.cpython-311.pyc b/modules/__pycache__/llamacpp_model.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..59b2e5e5345eceb7bbaa9d802b66a1e43ada7b6d
Binary files /dev/null and b/modules/__pycache__/llamacpp_model.cpython-311.pyc differ
diff --git a/modules/__pycache__/loaders.cpython-311.pyc b/modules/__pycache__/loaders.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3aa8c78ec9a932f652794ff4393db1be1f33774e
Binary files /dev/null and b/modules/__pycache__/loaders.cpython-311.pyc differ
diff --git a/modules/__pycache__/logging_colors.cpython-311.pyc b/modules/__pycache__/logging_colors.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..658ae0189e8e39a61edddf7e35f412b7ae12ea20
Binary files /dev/null and b/modules/__pycache__/logging_colors.cpython-311.pyc differ
diff --git a/modules/__pycache__/logits.cpython-311.pyc b/modules/__pycache__/logits.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c9bc55e36b6e0d3f1fe74967d056a8fb4b7e7c4
Binary files /dev/null and b/modules/__pycache__/logits.cpython-311.pyc differ
diff --git a/modules/__pycache__/metadata_gguf.cpython-311.pyc b/modules/__pycache__/metadata_gguf.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03f29758067bbefe9504b73e31198c28e0218fc9
Binary files /dev/null and b/modules/__pycache__/metadata_gguf.cpython-311.pyc differ
diff --git a/modules/__pycache__/models.cpython-311.pyc b/modules/__pycache__/models.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fecb55bc005f18c22a44122953b46687cdd10cdd
Binary files /dev/null and b/modules/__pycache__/models.cpython-311.pyc differ
diff --git a/modules/__pycache__/models_settings.cpython-311.pyc b/modules/__pycache__/models_settings.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36b418007fc36e4fc9ec1ea85fff38e5a949f4a7
Binary files /dev/null and b/modules/__pycache__/models_settings.cpython-311.pyc differ
diff --git a/modules/__pycache__/one_click_installer_check.cpython-311.pyc b/modules/__pycache__/one_click_installer_check.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d279437b753ad110d410f376665faf0c2451cd5
Binary files /dev/null and b/modules/__pycache__/one_click_installer_check.cpython-311.pyc differ
diff --git a/modules/__pycache__/presets.cpython-311.pyc b/modules/__pycache__/presets.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..90776641df6f92707e5f78ab6b791f30b1266f12
Binary files /dev/null and b/modules/__pycache__/presets.cpython-311.pyc differ
diff --git a/modules/__pycache__/prompts.cpython-311.pyc b/modules/__pycache__/prompts.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a433e9722287ab4bed31bd1a0d9cbf3d80a8107
Binary files /dev/null and b/modules/__pycache__/prompts.cpython-311.pyc differ
diff --git a/modules/__pycache__/relative_imports.cpython-311.pyc b/modules/__pycache__/relative_imports.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b58a2593d19d054c506dc568fc4c0b7533476abb
Binary files /dev/null and b/modules/__pycache__/relative_imports.cpython-311.pyc differ
diff --git a/modules/__pycache__/sampler_hijack.cpython-311.pyc b/modules/__pycache__/sampler_hijack.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..155ac8859768ea29c841f623ac27840057f88973
Binary files /dev/null and b/modules/__pycache__/sampler_hijack.cpython-311.pyc differ
diff --git a/modules/__pycache__/shared.cpython-311.pyc b/modules/__pycache__/shared.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c57676784df6109dc5b3dc5786a7ed9e3cf2899
Binary files /dev/null and b/modules/__pycache__/shared.cpython-311.pyc differ
diff --git a/modules/__pycache__/text_generation.cpython-311.pyc b/modules/__pycache__/text_generation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..62cea7e18c472461143543cc40379c969c04fe00
Binary files /dev/null and b/modules/__pycache__/text_generation.cpython-311.pyc differ
diff --git a/modules/__pycache__/training.cpython-311.pyc b/modules/__pycache__/training.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0f35e9f67f4c2f4cfa312c958ddf42a0522110c
Binary files /dev/null and b/modules/__pycache__/training.cpython-311.pyc differ
diff --git a/modules/__pycache__/ui.cpython-311.pyc b/modules/__pycache__/ui.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3eb5db0c6514517f6e9222966231c67348993eac
Binary files /dev/null and b/modules/__pycache__/ui.cpython-311.pyc differ
diff --git a/modules/__pycache__/ui_chat.cpython-311.pyc b/modules/__pycache__/ui_chat.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cfef4c90ddaadf446107bfc42295344a8a36f6bf
Binary files /dev/null and b/modules/__pycache__/ui_chat.cpython-311.pyc differ
diff --git a/modules/__pycache__/ui_default.cpython-311.pyc b/modules/__pycache__/ui_default.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03df11e414a242ebe6c248e0ae1a4d8a34547c19
Binary files /dev/null and b/modules/__pycache__/ui_default.cpython-311.pyc differ
diff --git a/modules/__pycache__/ui_file_saving.cpython-311.pyc b/modules/__pycache__/ui_file_saving.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..656e7a594e1fc3517770bb0e48509eef38f2b041
Binary files /dev/null and b/modules/__pycache__/ui_file_saving.cpython-311.pyc differ
diff --git a/modules/__pycache__/ui_model_menu.cpython-311.pyc b/modules/__pycache__/ui_model_menu.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85dd199db6a386c04bcdf706ce3de9f3c8a31122
Binary files /dev/null and b/modules/__pycache__/ui_model_menu.cpython-311.pyc differ
diff --git a/modules/__pycache__/ui_notebook.cpython-311.pyc b/modules/__pycache__/ui_notebook.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fe4d63f6400d1243d28b1ec66a8f7d572c08380
Binary files /dev/null and b/modules/__pycache__/ui_notebook.cpython-311.pyc differ
diff --git a/modules/__pycache__/ui_parameters.cpython-311.pyc b/modules/__pycache__/ui_parameters.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5ccc84be5251d3c52dc75e8954d3dcd4a29a1eb
Binary files /dev/null and b/modules/__pycache__/ui_parameters.cpython-311.pyc differ
diff --git a/modules/__pycache__/ui_session.cpython-311.pyc b/modules/__pycache__/ui_session.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d6e7bef25ebadf12cb96b9d2438ae99350f199a1
Binary files /dev/null and b/modules/__pycache__/ui_session.cpython-311.pyc differ
diff --git a/modules/__pycache__/utils.cpython-311.pyc b/modules/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c5a3a51a58dbadc5f2287898584d68a0ba92271
Binary files /dev/null and b/modules/__pycache__/utils.cpython-311.pyc differ
diff --git a/modules/block_requests.py b/modules/block_requests.py
new file mode 100644
index 0000000000000000000000000000000000000000..38f1a17fe227682cff5d7f27e4914dfaa12ebf5a
--- /dev/null
+++ b/modules/block_requests.py
@@ -0,0 +1,47 @@
+import builtins
+import io
+
+import requests
+
+from modules.logging_colors import logger
+
+original_open = open
+original_get = requests.get
+
+
+class RequestBlocker:
+
+    def __enter__(self):
+        requests.get = my_get
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        requests.get = original_get
+
+
+class OpenMonkeyPatch:
+
+    def __enter__(self):
+        builtins.open = my_open
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        builtins.open = original_open
+
+
+def my_get(url, **kwargs):
+    logger.info('Unwanted HTTP request redirected to localhost :)')
+    kwargs.setdefault('allow_redirects', True)
+    return requests.api.request('get', 'http://127.0.0.1/', **kwargs)
+
+
+# Kindly provided by our friend WizardLM-30B
+def my_open(*args, **kwargs):
+    filename = str(args[0])
+    if filename.endswith('index.html'):
+        with original_open(*args, **kwargs) as f:
+            file_contents = f.read()
+
+        file_contents = file_contents.replace(b'\t\t<script\n\t\t\tsrc="https://cdnjs.cloudflare.com/ajax/libs/iframe-resizer/4.3.7/iframeResizer.contentWindow.min.js"\n\t\t\tasync\n\t\t></script>', b'')
+        file_contents = file_contents.replace(b'cdnjs.cloudflare.com', b'127.0.0.1')
+        return io.BytesIO(file_contents)
+    else:
+        return original_open(*args, **kwargs)
diff --git a/modules/callbacks.py b/modules/callbacks.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b219954f78c58a8733836b4bbfa0cb6518ea560
--- /dev/null
+++ b/modules/callbacks.py
@@ -0,0 +1,103 @@
+import gc
+import traceback
+from queue import Queue
+from threading import Thread
+
+import torch
+import transformers
+from transformers import is_torch_xpu_available
+
+import modules.shared as shared
+
+
+class StopNowException(Exception):
+    pass
+
+
+class _StopEverythingStoppingCriteria(transformers.StoppingCriteria):
+    def __init__(self):
+        transformers.StoppingCriteria.__init__(self)
+
+    def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
+        return shared.stop_everything
+
+
+class Stream(transformers.StoppingCriteria):
+    def __init__(self, callback_func=None):
+        self.callback_func = callback_func
+
+    def __call__(self, input_ids, scores) -> bool:
+        if self.callback_func is not None:
+            self.callback_func(input_ids[0])
+
+        return False
+
+
+class Iteratorize:
+
+    """
+    Transforms a function that takes a callback
+    into a lazy iterator (generator).
+
+    Adapted from: https://stackoverflow.com/a/9969000
+    """
+
+    def __init__(self, func, args=None, kwargs=None, callback=None):
+        self.mfunc = func
+        self.c_callback = callback
+        self.q = Queue()
+        self.sentinel = object()
+        self.args = args or []
+        self.kwargs = kwargs or {}
+        self.stop_now = False
+
+        def _callback(val):
+            if self.stop_now or shared.stop_everything:
+                raise StopNowException
+            self.q.put(val)
+
+        def gentask():
+            try:
+                ret = self.mfunc(callback=_callback, *args, **self.kwargs)
+            except StopNowException:
+                pass
+            except:
+                traceback.print_exc()
+                pass
+
+            clear_torch_cache()
+            self.q.put(self.sentinel)
+            if self.c_callback:
+                self.c_callback(ret)
+
+        self.thread = Thread(target=gentask)
+        self.thread.start()
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        obj = self.q.get(True, None)
+        if obj is self.sentinel:
+            raise StopIteration
+        else:
+            return obj
+
+    def __del__(self):
+        clear_torch_cache()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.stop_now = True
+        clear_torch_cache()
+
+
+def clear_torch_cache():
+    gc.collect()
+    if not shared.args.cpu:
+        if is_torch_xpu_available():
+            torch.xpu.empty_cache()
+        else:
+            torch.cuda.empty_cache()
diff --git a/modules/chat.py b/modules/chat.py
new file mode 100644
index 0000000000000000000000000000000000000000..bddc31326b8b51515771d136a506be720b86df7a
--- /dev/null
+++ b/modules/chat.py
@@ -0,0 +1,927 @@
+import base64
+import copy
+import functools
+import html
+import json
+import re
+from datetime import datetime
+from functools import partial
+from pathlib import Path
+
+import gradio as gr
+import yaml
+from jinja2.sandbox import ImmutableSandboxedEnvironment
+from PIL import Image
+
+import modules.shared as shared
+from modules import utils
+from modules.extensions import apply_extensions
+from modules.html_generator import chat_html_wrapper, make_thumbnail
+from modules.logging_colors import logger
+from modules.text_generation import (
+    generate_reply,
+    get_encoded_length,
+    get_max_prompt_length
+)
+from modules.utils import delete_file, get_available_characters, save_file
+
+# Copied from the Transformers library
+jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
+
+
+def str_presenter(dumper, data):
+    """
+    Copied from https://github.com/yaml/pyyaml/issues/240
+    Makes pyyaml output prettier multiline strings.
+    """
+
+    if data.count('\n') > 0:
+        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
+
+    return dumper.represent_scalar('tag:yaml.org,2002:str', data)
+
+
+yaml.add_representer(str, str_presenter)
+yaml.representer.SafeRepresenter.add_representer(str, str_presenter)
+
+
+def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=True):
+    '''
+    Given a Jinja template, reverse-engineers the prefix and the suffix for
+    an assistant message (if impersonate=False) or an user message
+    (if impersonate=True)
+    '''
+
+    if impersonate:
+        messages = [
+            {"role": "user", "content": "<<|user-message-1|>>"},
+            {"role": "user", "content": "<<|user-message-2|>>"},
+        ]
+    else:
+        messages = [
+            {"role": "assistant", "content": "<<|user-message-1|>>"},
+            {"role": "assistant", "content": "<<|user-message-2|>>"},
+        ]
+
+    prompt = renderer(messages=messages)
+
+    suffix_plus_prefix = prompt.split("<<|user-message-1|>>")[1].split("<<|user-message-2|>>")[0]
+    suffix = prompt.split("<<|user-message-2|>>")[1]
+    prefix = suffix_plus_prefix[len(suffix):]
+
+    if strip_trailing_spaces:
+        prefix = prefix.rstrip(' ')
+
+    return prefix, suffix
+
+
+def generate_chat_prompt(user_input, state, **kwargs):
+    impersonate = kwargs.get('impersonate', False)
+    _continue = kwargs.get('_continue', False)
+    also_return_rows = kwargs.get('also_return_rows', False)
+    history = kwargs.get('history', state['history'])['internal']
+
+    # Templates
+    chat_template = jinja_env.from_string(state['chat_template_str'])
+    instruction_template = jinja_env.from_string(state['instruction_template_str'])
+    chat_renderer = partial(chat_template.render, add_generation_prompt=False, name1=state['name1'], name2=state['name2'])
+    instruct_renderer = partial(instruction_template.render, add_generation_prompt=False)
+
+    messages = []
+
+    if state['mode'] == 'instruct':
+        renderer = instruct_renderer
+        if state['custom_system_message'].strip() != '':
+            messages.append({"role": "system", "content": state['custom_system_message']})
+    else:
+        renderer = chat_renderer
+        if state['context'].strip() != '':
+            context = replace_character_names(state['context'], state['name1'], state['name2'])
+            messages.append({"role": "system", "content": context})
+
+    insert_pos = len(messages)
+    for user_msg, assistant_msg in reversed(history):
+        user_msg = user_msg.strip()
+        assistant_msg = assistant_msg.strip()
+
+        if assistant_msg:
+            messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})
+
+        if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']:
+            messages.insert(insert_pos, {"role": "user", "content": user_msg})
+
+    user_input = user_input.strip()
+    if user_input and not impersonate and not _continue:
+        messages.append({"role": "user", "content": user_input})
+
+    def remove_extra_bos(prompt):
+        for bos_token in ['<s>', '<|startoftext|>']:
+            while prompt.startswith(bos_token):
+                prompt = prompt[len(bos_token):]
+
+        return prompt
+
+    def make_prompt(messages):
+        if state['mode'] == 'chat-instruct' and _continue:
+            prompt = renderer(messages=messages[:-1])
+        else:
+            prompt = renderer(messages=messages)
+
+        if state['mode'] == 'chat-instruct':
+            outer_messages = []
+            if state['custom_system_message'].strip() != '':
+                outer_messages.append({"role": "system", "content": state['custom_system_message']})
+
+            prompt = remove_extra_bos(prompt)
+            command = state['chat-instruct_command']
+            command = command.replace('<|character|>', state['name2'] if not impersonate else state['name1'])
+            command = command.replace('<|prompt|>', prompt)
+
+            if _continue:
+                prefix = get_generation_prompt(renderer, impersonate=impersonate, strip_trailing_spaces=False)[0]
+                prefix += messages[-1]["content"]
+            else:
+                prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
+                if not impersonate:
+                    prefix = apply_extensions('bot_prefix', prefix, state)
+
+            outer_messages.append({"role": "user", "content": command})
+            outer_messages.append({"role": "assistant", "content": prefix})
+
+            prompt = instruction_template.render(messages=outer_messages)
+            suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1]
+            prompt = prompt[:-len(suffix)]
+
+        else:
+            if _continue:
+                suffix = get_generation_prompt(renderer, impersonate=impersonate)[1]
+                prompt = prompt[:-len(suffix)]
+            else:
+                prefix = get_generation_prompt(renderer, impersonate=impersonate)[0]
+                if state['mode'] == 'chat' and not impersonate:
+                    prefix = apply_extensions('bot_prefix', prefix, state)
+
+                prompt += prefix
+
+        prompt = remove_extra_bos(prompt)
+        return prompt
+
+    # Handle truncation
+    max_length = get_max_prompt_length(state)
+    prompt = make_prompt(messages)
+    encoded_length = get_encoded_length(prompt)
+
+    while len(messages) > 0 and encoded_length > max_length:
+
+        # Remove old message, save system message
+        if len(messages) > 2 and messages[0]['role'] == 'system':
+            messages.pop(1)
+
+        # Remove old message when no system message is present
+        elif len(messages) > 1 and messages[0]['role'] != 'system':
+            messages.pop(0)
+
+        # Resort to truncating the user input
+        else:
+
+            user_message = messages[-1]['content']
+
+            # Bisect the truncation point
+            left, right = 0, len(user_message) - 1
+
+            while right - left > 1:
+                mid = (left + right) // 2
+
+                messages[-1]['content'] = user_message[mid:]
+                prompt = make_prompt(messages)
+                encoded_length = get_encoded_length(prompt)
+
+                if encoded_length <= max_length:
+                    right = mid
+                else:
+                    left = mid
+
+            messages[-1]['content'] = user_message[right:]
+            prompt = make_prompt(messages)
+            encoded_length = get_encoded_length(prompt)
+            if encoded_length > max_length:
+                logger.error(f"Failed to build the chat prompt. The input is too long for the available context length.\n\nTruncation length: {state['truncation_length']}\nmax_new_tokens: {state['max_new_tokens']} (is it too high?)\nAvailable context length: {max_length}\n")
+                raise ValueError
+            else:
+                logger.warning(f"The input has been truncated. Context length: {state['truncation_length']}, max_new_tokens: {state['max_new_tokens']}, available context length: {max_length}.")
+                break
+
+        prompt = make_prompt(messages)
+        encoded_length = get_encoded_length(prompt)
+
+    if also_return_rows:
+        return prompt, [message['content'] for message in messages]
+    else:
+        return prompt
+
+
+def get_stopping_strings(state):
+    stopping_strings = []
+    renderers = []
+
+    if state['mode'] in ['instruct', 'chat-instruct']:
+        template = jinja_env.from_string(state['instruction_template_str'])
+        renderer = partial(template.render, add_generation_prompt=False)
+        renderers.append(renderer)
+
+    if state['mode'] in ['chat', 'chat-instruct']:
+        template = jinja_env.from_string(state['chat_template_str'])
+        renderer = partial(template.render, add_generation_prompt=False, name1=state['name1'], name2=state['name2'])
+        renderers.append(renderer)
+
+    for renderer in renderers:
+        prefix_bot, suffix_bot = get_generation_prompt(renderer, impersonate=False)
+        prefix_user, suffix_user = get_generation_prompt(renderer, impersonate=True)
+
+        stopping_strings += [
+            suffix_user + prefix_bot,
+            suffix_user + prefix_user,
+            suffix_bot + prefix_bot,
+            suffix_bot + prefix_user,
+        ]
+
+    if 'stopping_strings' in state and isinstance(state['stopping_strings'], list):
+        stopping_strings += state.pop('stopping_strings')
+
+    return list(set(stopping_strings))
+
+
+def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False):
+    history = state['history']
+    output = copy.deepcopy(history)
+    output = apply_extensions('history', output)
+    state = apply_extensions('state', state)
+
+    visible_text = None
+    stopping_strings = get_stopping_strings(state)
+    is_stream = state['stream']
+
+    # Prepare the input
+    if not (regenerate or _continue):
+        visible_text = html.escape(text)
+
+        # Apply extensions
+        text, visible_text = apply_extensions('chat_input', text, visible_text, state)
+        text = apply_extensions('input', text, state, is_chat=True)
+
+        output['internal'].append([text, ''])
+        output['visible'].append([visible_text, ''])
+
+        # *Is typing...*
+        if loading_message:
+            yield {
+                'visible': output['visible'][:-1] + [[output['visible'][-1][0], shared.processing_message]],
+                'internal': output['internal']
+            }
+    else:
+        text, visible_text = output['internal'][-1][0], output['visible'][-1][0]
+        if regenerate:
+            if loading_message:
+                yield {
+                    'visible': output['visible'][:-1] + [[visible_text, shared.processing_message]],
+                    'internal': output['internal'][:-1] + [[text, '']]
+                }
+        elif _continue:
+            last_reply = [output['internal'][-1][1], output['visible'][-1][1]]
+            if loading_message:
+                yield {
+                    'visible': output['visible'][:-1] + [[visible_text, last_reply[1] + '...']],
+                    'internal': output['internal']
+                }
+
+    if shared.model_name == 'None' or shared.model is None:
+        raise ValueError("No model is loaded! Select one in the Model tab.")
+
+    # Generate the prompt
+    kwargs = {
+        '_continue': _continue,
+        'history': output if _continue else {k: v[:-1] for k, v in output.items()}
+    }
+    prompt = apply_extensions('custom_generate_chat_prompt', text, state, **kwargs)
+    if prompt is None:
+        prompt = generate_chat_prompt(text, state, **kwargs)
+
+    # Generate
+    reply = None
+    for j, reply in enumerate(generate_reply(prompt, state, stopping_strings=stopping_strings, is_chat=True, for_ui=for_ui)):
+
+        # Extract the reply
+        visible_reply = reply
+        if state['mode'] in ['chat', 'chat-instruct']:
+            visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply)
+
+        visible_reply = html.escape(visible_reply)
+
+        if shared.stop_everything:
+            output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
+            yield output
+            return
+
+        if _continue:
+            output['internal'][-1] = [text, last_reply[0] + reply]
+            output['visible'][-1] = [visible_text, last_reply[1] + visible_reply]
+            if is_stream:
+                yield output
+        elif not (j == 0 and visible_reply.strip() == ''):
+            output['internal'][-1] = [text, reply.lstrip(' ')]
+            output['visible'][-1] = [visible_text, visible_reply.lstrip(' ')]
+            if is_stream:
+                yield output
+
+    output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
+    yield output
+
+
+def impersonate_wrapper(text, state):
+
+    static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
+
+    if shared.model_name == 'None' or shared.model is None:
+        logger.error("No model is loaded! Select one in the Model tab.")
+        yield '', static_output
+        return
+
+    prompt = generate_chat_prompt('', state, impersonate=True)
+    stopping_strings = get_stopping_strings(state)
+
+    yield text + '...', static_output
+    reply = None
+    for reply in generate_reply(prompt + text, state, stopping_strings=stopping_strings, is_chat=True):
+        yield (text + reply).lstrip(' '), static_output
+        if shared.stop_everything:
+            return
+
+
+def generate_chat_reply(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False):
+    history = state['history']
+    if regenerate or _continue:
+        text = ''
+        if (len(history['visible']) == 1 and not history['visible'][0][0]) or len(history['internal']) == 0:
+            yield history
+            return
+
+    for history in chatbot_wrapper(text, state, regenerate=regenerate, _continue=_continue, loading_message=loading_message, for_ui=for_ui):
+        yield history
+
+
+def character_is_loaded(state, raise_exception=False):
+    if state['mode'] in ['chat', 'chat-instruct'] and state['name2'] == '':
+        logger.error('It looks like no character is loaded. Please load one under Parameters > Character.')
+        if raise_exception:
+            raise ValueError
+
+        return False
+    else:
+        return True
+
+
+def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
+    '''
+    Same as above but returns HTML for the UI
+    '''
+
+    if not character_is_loaded(state):
+        return
+
+    if state['start_with'] != '' and not _continue:
+        if regenerate:
+            text, state['history'] = remove_last_message(state['history'])
+            regenerate = False
+
+        _continue = True
+        send_dummy_message(text, state)
+        send_dummy_reply(state['start_with'], state)
+
+    for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)):
+        yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
+
+
+def remove_last_message(history):
+    if len(history['visible']) > 0 and history['internal'][-1][0] != '<|BEGIN-VISIBLE-CHAT|>':
+        last = history['visible'].pop()
+        history['internal'].pop()
+    else:
+        last = ['', '']
+
+    return html.unescape(last[0]), history
+
+
+def send_last_reply_to_input(history):
+    if len(history['visible']) > 0:
+        return html.unescape(history['visible'][-1][1])
+    else:
+        return ''
+
+
+def replace_last_reply(text, state):
+    history = state['history']
+
+    if len(text.strip()) == 0:
+        return history
+    elif len(history['visible']) > 0:
+        history['visible'][-1][1] = html.escape(text)
+        history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True)
+
+    return history
+
+
+def send_dummy_message(text, state):
+    history = state['history']
+    history['visible'].append([html.escape(text), ''])
+    history['internal'].append([apply_extensions('input', text, state, is_chat=True), ''])
+    return history
+
+
+def send_dummy_reply(text, state):
+    history = state['history']
+    if len(history['visible']) > 0 and not history['visible'][-1][1] == '':
+        history['visible'].append(['', ''])
+        history['internal'].append(['', ''])
+
+    history['visible'][-1][1] = html.escape(text)
+    history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True)
+    return history
+
+
+def redraw_html(history, name1, name2, mode, style, character, reset_cache=False):
+    return chat_html_wrapper(history, name1, name2, mode, style, character, reset_cache=reset_cache)
+
+
+def start_new_chat(state):
+    mode = state['mode']
+    history = {'internal': [], 'visible': []}
+
+    if mode != 'instruct':
+        greeting = replace_character_names(state['greeting'], state['name1'], state['name2'])
+        if greeting != '':
+            history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]]
+            history['visible'] += [['', apply_extensions('output', greeting, state, is_chat=True)]]
+
+    unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
+    save_history(history, unique_id, state['character_menu'], state['mode'])
+
+    return history
+
+
+def get_history_file_path(unique_id, character, mode):
+    if mode == 'instruct':
+        p = Path(f'logs/instruct/{unique_id}.json')
+    else:
+        p = Path(f'logs/chat/{character}/{unique_id}.json')
+
+    return p
+
+
+def save_history(history, unique_id, character, mode):
+    if shared.args.multi_user:
+        return
+
+    p = get_history_file_path(unique_id, character, mode)
+    if not p.parent.is_dir():
+        p.parent.mkdir(parents=True)
+
+    with open(p, 'w', encoding='utf-8') as f:
+        f.write(json.dumps(history, indent=4))
+
+
+def rename_history(old_id, new_id, character, mode):
+    if shared.args.multi_user:
+        return
+
+    old_p = get_history_file_path(old_id, character, mode)
+    new_p = get_history_file_path(new_id, character, mode)
+    if new_p.parent != old_p.parent:
+        logger.error(f"The following path is not allowed: {new_p}.")
+    elif new_p == old_p:
+        logger.info("The provided path is identical to the old one.")
+    else:
+        logger.info(f"Renaming {old_p} to {new_p}")
+        old_p.rename(new_p)
+
+
+def find_all_histories(state):
+    if shared.args.multi_user:
+        return ['']
+
+    if state['mode'] == 'instruct':
+        paths = Path('logs/instruct').glob('*.json')
+    else:
+        character = state['character_menu']
+
+        # Handle obsolete filenames and paths
+        old_p = Path(f'logs/{character}_persistent.json')
+        new_p = Path(f'logs/persistent_{character}.json')
+        if old_p.exists():
+            logger.warning(f"Renaming {old_p} to {new_p}")
+            old_p.rename(new_p)
+        if new_p.exists():
+            unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
+            p = get_history_file_path(unique_id, character, state['mode'])
+            logger.warning(f"Moving {new_p} to {p}")
+            p.parent.mkdir(exist_ok=True)
+            new_p.rename(p)
+
+        paths = Path(f'logs/chat/{character}').glob('*.json')
+
+    histories = sorted(paths, key=lambda x: x.stat().st_mtime, reverse=True)
+    histories = [path.stem for path in histories]
+
+    return histories
+
+
+def load_latest_history(state):
+    '''
+    Loads the latest history for the given character in chat or chat-instruct
+    mode, or the latest instruct history for instruct mode.
+    '''
+
+    if shared.args.multi_user:
+        return start_new_chat(state)
+
+    histories = find_all_histories(state)
+
+    if len(histories) > 0:
+        history = load_history(histories[0], state['character_menu'], state['mode'])
+    else:
+        history = start_new_chat(state)
+
+    return history
+
+
+def load_history_after_deletion(state, idx):
+    '''
+    Loads the latest history for the given character in chat or chat-instruct
+    mode, or the latest instruct history for instruct mode.
+    '''
+
+    if shared.args.multi_user:
+        return start_new_chat(state)
+
+    histories = find_all_histories(state)
+    idx = min(int(idx), len(histories) - 1)
+    idx = max(0, idx)
+
+    if len(histories) > 0:
+        history = load_history(histories[idx], state['character_menu'], state['mode'])
+    else:
+        history = start_new_chat(state)
+        histories = find_all_histories(state)
+
+    return history, gr.update(choices=histories, value=histories[idx])
+
+
+def update_character_menu_after_deletion(idx):
+    characters = utils.get_available_characters()
+    idx = min(int(idx), len(characters) - 1)
+    idx = max(0, idx)
+    return gr.update(choices=characters, value=characters[idx])
+
+
+def load_history(unique_id, character, mode):
+    p = get_history_file_path(unique_id, character, mode)
+
+    f = json.loads(open(p, 'rb').read())
+    if 'internal' in f and 'visible' in f:
+        history = f
+    else:
+        history = {
+            'internal': f['data'],
+            'visible': f['data_visible']
+        }
+
+    return history
+
+
+def load_history_json(file, history):
+    try:
+        file = file.decode('utf-8')
+        f = json.loads(file)
+        if 'internal' in f and 'visible' in f:
+            history = f
+        else:
+            history = {
+                'internal': f['data'],
+                'visible': f['data_visible']
+            }
+
+        return history
+    except:
+        return history
+
+
+def delete_history(unique_id, character, mode):
+    p = get_history_file_path(unique_id, character, mode)
+    delete_file(p)
+
+
+def replace_character_names(text, name1, name2):
+    text = text.replace('{{user}}', name1).replace('{{char}}', name2)
+    return text.replace('<USER>', name1).replace('<BOT>', name2)
+
+
+def generate_pfp_cache(character):
+    cache_folder = Path(shared.args.disk_cache_dir)
+    if not cache_folder.exists():
+        cache_folder.mkdir()
+
+    for path in [Path(f"characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]:
+        if path.exists():
+            original_img = Image.open(path)
+            original_img.save(Path(f'{cache_folder}/pfp_character.png'), format='PNG')
+
+            thumb = make_thumbnail(original_img)
+            thumb.save(Path(f'{cache_folder}/pfp_character_thumb.png'), format='PNG')
+
+            return thumb
+
+    return None
+
+
+def load_character(character, name1, name2):
+    context = greeting = ""
+    greeting_field = 'greeting'
+    picture = None
+
+    filepath = None
+    for extension in ["yml", "yaml", "json"]:
+        filepath = Path(f'characters/{character}.{extension}')
+        if filepath.exists():
+            break
+
+    if filepath is None or not filepath.exists():
+        logger.error(f"Could not find the character \"{character}\" inside characters/. No character has been loaded.")
+        raise ValueError
+
+    file_contents = open(filepath, 'r', encoding='utf-8').read()
+    data = json.loads(file_contents) if extension == "json" else yaml.safe_load(file_contents)
+    cache_folder = Path(shared.args.disk_cache_dir)
+
+    for path in [Path(f"{cache_folder}/pfp_character.png"), Path(f"{cache_folder}/pfp_character_thumb.png")]:
+        if path.exists():
+            path.unlink()
+
+    picture = generate_pfp_cache(character)
+
+    # Finding the bot's name
+    for k in ['name', 'bot', '<|bot|>', 'char_name']:
+        if k in data and data[k] != '':
+            name2 = data[k]
+            break
+
+    # Find the user name (if any)
+    for k in ['your_name', 'user', '<|user|>']:
+        if k in data and data[k] != '':
+            name1 = data[k]
+            break
+
+    if 'context' in data:
+        context = data['context'].strip()
+    elif "char_persona" in data:
+        context = build_pygmalion_style_context(data)
+        greeting_field = 'char_greeting'
+
+    greeting = data.get(greeting_field, greeting)
+    return name1, name2, picture, greeting, context
+
+
+def load_instruction_template(template):
+    for filepath in [Path(f'instruction-templates/{template}.yaml'), Path('instruction-templates/Alpaca.yaml')]:
+        if filepath.exists():
+            break
+    else:
+        return ''
+
+    file_contents = open(filepath, 'r', encoding='utf-8').read()
+    data = yaml.safe_load(file_contents)
+    if 'instruction_template' in data:
+        return data['instruction_template']
+    else:
+        return jinja_template_from_old_format(data)
+
+
+@functools.cache
+def load_character_memoized(character, name1, name2):
+    return load_character(character, name1, name2)
+
+
+@functools.cache
+def load_instruction_template_memoized(template):
+    return load_instruction_template(template)
+
+
+def upload_character(file, img, tavern=False):
+    decoded_file = file if isinstance(file, str) else file.decode('utf-8')
+    try:
+        data = json.loads(decoded_file)
+    except:
+        data = yaml.safe_load(decoded_file)
+
+    if 'char_name' in data:
+        name = data['char_name']
+        greeting = data['char_greeting']
+        context = build_pygmalion_style_context(data)
+        yaml_data = generate_character_yaml(name, greeting, context)
+    else:
+        name = data['name']
+        yaml_data = generate_character_yaml(data['name'], data['greeting'], data['context'])
+
+    outfile_name = name
+    i = 1
+    while Path(f'characters/{outfile_name}.yaml').exists():
+        outfile_name = f'{name}_{i:03d}'
+        i += 1
+
+    with open(Path(f'characters/{outfile_name}.yaml'), 'w', encoding='utf-8') as f:
+        f.write(yaml_data)
+
+    if img is not None:
+        img.save(Path(f'characters/{outfile_name}.png'))
+
+    logger.info(f'New character saved to "characters/{outfile_name}.yaml".')
+    return gr.update(value=outfile_name, choices=get_available_characters())
+
+
+def build_pygmalion_style_context(data):
+    context = ""
+    if 'char_persona' in data and data['char_persona'] != '':
+        context += f"{data['char_name']}'s Persona: {data['char_persona']}\n"
+
+    if 'world_scenario' in data and data['world_scenario'] != '':
+        context += f"Scenario: {data['world_scenario']}\n"
+
+    if 'example_dialogue' in data and data['example_dialogue'] != '':
+        context += f"{data['example_dialogue'].strip()}\n"
+
+    context = f"{context.strip()}\n"
+    return context
+
+
+def upload_tavern_character(img, _json):
+    _json = {'char_name': _json['name'], 'char_persona': _json['description'], 'char_greeting': _json['first_mes'], 'example_dialogue': _json['mes_example'], 'world_scenario': _json['scenario']}
+    return upload_character(json.dumps(_json), img, tavern=True)
+
+
+def check_tavern_character(img):
+    if "chara" not in img.info:
+        return "Not a TavernAI card", None, None, gr.update(interactive=False)
+
+    decoded_string = base64.b64decode(img.info['chara']).replace(b'\\r\\n', b'\\n')
+    _json = json.loads(decoded_string)
+    if "data" in _json:
+        _json = _json["data"]
+
+    return _json['name'], _json['description'], _json, gr.update(interactive=True)
+
+
+def upload_your_profile_picture(img):
+    cache_folder = Path(shared.args.disk_cache_dir)
+    if not cache_folder.exists():
+        cache_folder.mkdir()
+
+    if img is None:
+        if Path(f"{cache_folder}/pfp_me.png").exists():
+            Path(f"{cache_folder}/pfp_me.png").unlink()
+    else:
+        img = make_thumbnail(img)
+        img.save(Path(f'{cache_folder}/pfp_me.png'))
+        logger.info(f'Profile picture saved to "{cache_folder}/pfp_me.png"')
+
+
+def generate_character_yaml(name, greeting, context):
+    data = {
+        'name': name,
+        'greeting': greeting,
+        'context': context,
+    }
+
+    data = {k: v for k, v in data.items() if v}  # Strip falsy
+    return yaml.dump(data, sort_keys=False, width=float("inf"))
+
+
+def generate_instruction_template_yaml(instruction_template):
+    data = {
+        'instruction_template': instruction_template
+    }
+
+    return my_yaml_output(data)
+
+
+def save_character(name, greeting, context, picture, filename):
+    if filename == "":
+        logger.error("The filename is empty, so the character will not be saved.")
+        return
+
+    data = generate_character_yaml(name, greeting, context)
+    filepath = Path(f'characters/{filename}.yaml')
+    save_file(filepath, data)
+    path_to_img = Path(f'characters/{filename}.png')
+    if picture is not None:
+        picture.save(path_to_img)
+        logger.info(f'Saved {path_to_img}.')
+
+
+def delete_character(name, instruct=False):
+    for extension in ["yml", "yaml", "json"]:
+        delete_file(Path(f'characters/{name}.{extension}'))
+
+    delete_file(Path(f'characters/{name}.png'))
+
+
+def jinja_template_from_old_format(params, verbose=False):
+    MASTER_TEMPLATE = """
+{%- set ns = namespace(found=false) -%}
+{%- for message in messages -%}
+    {%- if message['role'] == 'system' -%}
+        {%- set ns.found = true -%}
+    {%- endif -%}
+{%- endfor -%}
+{%- if not ns.found -%}
+    {{- '<|PRE-SYSTEM|>' + '<|SYSTEM-MESSAGE|>' + '<|POST-SYSTEM|>' -}}
+{%- endif %}
+{%- for message in messages %}
+    {%- if message['role'] == 'system' -%}
+        {{- '<|PRE-SYSTEM|>' + message['content'] + '<|POST-SYSTEM|>' -}}
+    {%- else -%}
+        {%- if message['role'] == 'user' -%}
+            {{-'<|PRE-USER|>' + message['content'] + '<|POST-USER|>'-}}
+        {%- else -%}
+            {{-'<|PRE-ASSISTANT|>' + message['content'] + '<|POST-ASSISTANT|>' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{-'<|PRE-ASSISTANT-GENERATE|>'-}}
+{%- endif -%}
+"""
+
+    if 'context' in params and '<|system-message|>' in params['context']:
+        pre_system = params['context'].split('<|system-message|>')[0]
+        post_system = params['context'].split('<|system-message|>')[1]
+    else:
+        pre_system = ''
+        post_system = ''
+
+    pre_user = params['turn_template'].split('<|user-message|>')[0].replace('<|user|>', params['user'])
+    post_user = params['turn_template'].split('<|user-message|>')[1].split('<|bot|>')[0]
+
+    pre_assistant = '<|bot|>' + params['turn_template'].split('<|bot-message|>')[0].split('<|bot|>')[1]
+    pre_assistant = pre_assistant.replace('<|bot|>', params['bot'])
+    post_assistant = params['turn_template'].split('<|bot-message|>')[1]
+
+    def preprocess(string):
+        return string.replace('\n', '\\n').replace('\'', '\\\'')
+
+    pre_system = preprocess(pre_system)
+    post_system = preprocess(post_system)
+    pre_user = preprocess(pre_user)
+    post_user = preprocess(post_user)
+    pre_assistant = preprocess(pre_assistant)
+    post_assistant = preprocess(post_assistant)
+
+    if verbose:
+        print(
+            '\n',
+            repr(pre_system) + '\n',
+            repr(post_system) + '\n',
+            repr(pre_user) + '\n',
+            repr(post_user) + '\n',
+            repr(pre_assistant) + '\n',
+            repr(post_assistant) + '\n',
+        )
+
+    result = MASTER_TEMPLATE
+    if 'system_message' in params:
+        result = result.replace('<|SYSTEM-MESSAGE|>', preprocess(params['system_message']))
+    else:
+        result = result.replace('<|SYSTEM-MESSAGE|>', '')
+
+    result = result.replace('<|PRE-SYSTEM|>', pre_system)
+    result = result.replace('<|POST-SYSTEM|>', post_system)
+    result = result.replace('<|PRE-USER|>', pre_user)
+    result = result.replace('<|POST-USER|>', post_user)
+    result = result.replace('<|PRE-ASSISTANT|>', pre_assistant)
+    result = result.replace('<|PRE-ASSISTANT-GENERATE|>', pre_assistant.rstrip(' '))
+    result = result.replace('<|POST-ASSISTANT|>', post_assistant)
+
+    result = result.strip()
+
+    return result
+
+
+def my_yaml_output(data):
+    '''
+    pyyaml is very inconsistent with multiline strings.
+    for simple instruction template outputs, this is enough.
+    '''
+    result = ""
+    for k in data:
+        result += k + ": |-\n"
+        for line in data[k].splitlines():
+            result += "  " + line.rstrip(' ') + "\n"
+
+    return result
diff --git a/modules/ctransformers_model.py b/modules/ctransformers_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..70ce92f54bdb6704ce58f013fd28b0014a5f3b28
--- /dev/null
+++ b/modules/ctransformers_model.py
@@ -0,0 +1,79 @@
+from ctransformers import AutoConfig, AutoModelForCausalLM
+
+from modules import shared
+from modules.callbacks import Iteratorize
+from modules.logging_colors import logger
+
+
+class CtransformersModel:
+    def __init__(self):
+        pass
+
+    @classmethod
+    def from_pretrained(cls, path):
+        result = cls()
+
+        config = AutoConfig.from_pretrained(
+            str(path),
+            threads=shared.args.threads if shared.args.threads != 0 else -1,
+            gpu_layers=shared.args.n_gpu_layers,
+            batch_size=shared.args.n_batch,
+            context_length=shared.args.n_ctx,
+            stream=True,
+            mmap=not shared.args.no_mmap,
+            mlock=shared.args.mlock
+        )
+
+        result.model = AutoModelForCausalLM.from_pretrained(
+            str(result.model_dir(path) if result.model_type_is_auto() else path),
+            model_type=(None if result.model_type_is_auto() else shared.args.model_type),
+            config=config
+        )
+
+        logger.info(f'Using ctransformers model_type: {result.model.model_type} for {result.model.model_path}')
+        return result, result
+
+    def model_type_is_auto(self):
+        return shared.args.model_type is None or shared.args.model_type == "Auto" or shared.args.model_type == "None"
+
+    def model_dir(self, path):
+        if path.is_file():
+            return path.parent
+
+        return path
+
+    def encode(self, string, **kwargs):
+        return self.model.tokenize(string)
+
+    def decode(self, ids):
+        return self.model.detokenize(ids)
+
+    def generate(self, prompt, state, callback=None):
+        prompt = prompt if type(prompt) is str else prompt.decode()
+        # ctransformers uses -1 for random seed
+        generator = self.model(
+            prompt=prompt,
+            max_new_tokens=state['max_new_tokens'],
+            temperature=state['temperature'],
+            top_p=state['top_p'],
+            top_k=state['top_k'],
+            repetition_penalty=state['repetition_penalty'],
+            last_n_tokens=state['repetition_penalty_range'],
+            seed=int(state['seed'])
+        )
+
+        output = ""
+        for token in generator:
+            if callback:
+                callback(token)
+
+            output += token
+
+        return output
+
+    def generate_with_streaming(self, *args, **kwargs):
+        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
+            reply = ''
+            for token in generator:
+                reply += token
+                yield reply
diff --git a/modules/deepspeed_parameters.py b/modules/deepspeed_parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..f170a385cfc3dfb954fc6f5595cf8706e42aed30
--- /dev/null
+++ b/modules/deepspeed_parameters.py
@@ -0,0 +1,74 @@
+def generate_ds_config(ds_bf16, train_batch_size, nvme_offload_dir):
+    '''
+    DeepSpeed configuration
+    https://huggingface.co/docs/transformers/main_classes/deepspeed
+    '''
+
+    if nvme_offload_dir:
+        ds_config = {
+            "fp16": {
+                "enabled": not ds_bf16,
+            },
+            "bf16": {
+                "enabled": ds_bf16,
+            },
+            "zero_optimization": {
+                "stage": 3,
+                "offload_param": {
+                    "device": "nvme",
+                    "nvme_path": nvme_offload_dir,
+                    "pin_memory": True,
+                    "buffer_count": 5,
+                    "buffer_size": 1e9,
+                    "max_in_cpu": 1e9
+                },
+                "overlap_comm": True,
+                "reduce_bucket_size": "auto",
+                "contiguous_gradients": True,
+                "sub_group_size": 1e8,
+                "stage3_prefetch_bucket_size": "auto",
+                "stage3_param_persistence_threshold": "auto",
+                "stage3_max_live_parameters": "auto",
+                "stage3_max_reuse_distance": "auto",
+            },
+            "aio": {
+                "block_size": 262144,
+                "queue_depth": 32,
+                "thread_count": 1,
+                "single_submit": False,
+                "overlap_events": True
+            },
+            "steps_per_print": 2000,
+            "train_batch_size": train_batch_size,
+            "train_micro_batch_size_per_gpu": 1,
+            "wall_clock_breakdown": False
+        }
+    else:
+        ds_config = {
+            "fp16": {
+                "enabled": not ds_bf16,
+            },
+            "bf16": {
+                "enabled": ds_bf16,
+            },
+            "zero_optimization": {
+                "stage": 3,
+                "offload_param": {
+                    "device": "cpu",
+                    "pin_memory": True
+                },
+                "overlap_comm": True,
+                "contiguous_gradients": True,
+                "reduce_bucket_size": "auto",
+                "stage3_prefetch_bucket_size": "auto",
+                "stage3_param_persistence_threshold": "auto",
+                "stage3_max_live_parameters": "auto",
+                "stage3_max_reuse_distance": "auto",
+            },
+            "steps_per_print": 2000,
+            "train_batch_size": train_batch_size,
+            "train_micro_batch_size_per_gpu": 1,
+            "wall_clock_breakdown": False
+        }
+
+    return ds_config
diff --git a/modules/evaluate.py b/modules/evaluate.py
new file mode 100644
index 0000000000000000000000000000000000000000..bedafeb649cee788c657047c1ad377e91f39b1ab
--- /dev/null
+++ b/modules/evaluate.py
@@ -0,0 +1,153 @@
+import datetime
+from pathlib import Path
+
+import pandas as pd
+import torch
+from datasets import load_dataset
+from tqdm import tqdm
+
+from modules import shared
+from modules.logging_colors import logger
+from modules.models import clear_torch_cache, load_model, unload_model
+from modules.models_settings import get_model_metadata, update_model_parameters
+from modules.text_generation import encode
+
+
+def load_past_evaluations():
+    if Path('logs/evaluations.csv').exists():
+        df = pd.read_csv(Path('logs/evaluations.csv'), dtype=str)
+        df['Perplexity'] = pd.to_numeric(df['Perplexity'])
+        return df
+    else:
+        return pd.DataFrame(columns=['Model', 'LoRAs', 'Dataset', 'Perplexity', 'stride', 'max_length', 'Date', 'Comment'])
+
+
+past_evaluations = load_past_evaluations()
+
+
+def save_past_evaluations(df):
+    global past_evaluations
+    past_evaluations = df
+    filepath = Path('logs/evaluations.csv')
+    filepath.parent.mkdir(parents=True, exist_ok=True)
+    df.to_csv(filepath, index=False)
+
+
+def calculate_perplexity(models, input_dataset, stride, _max_length):
+    '''
+    Based on:
+    https://huggingface.co/docs/transformers/perplexity#calculating-ppl-with-fixedlength-models
+    '''
+
+    if not shared.args.no_use_fast:
+        logger.warning("--no_use_fast is not being used. If tokenizing the input dataset takes a long time, consider loading the model with that option checked.")
+
+    global past_evaluations
+    cumulative_log = ''
+    cumulative_log += "Loading the input dataset...\n\n"
+    yield cumulative_log
+
+    # Copied from https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/triton/utils/datautils.py
+    if input_dataset == 'wikitext':
+        data = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
+        text = "\n\n".join(data['text'])
+    elif input_dataset == 'ptb':
+        data = load_dataset('ptb_text_only', 'penn_treebank', split='validation')
+        text = "\n\n".join(data['sentence'])
+    elif input_dataset == 'ptb_new':
+        data = load_dataset('ptb_text_only', 'penn_treebank', split='test')
+        text = " ".join(data['sentence'])
+    else:
+        with open(Path(f'training/datasets/{input_dataset}.txt'), 'r', encoding='utf-8') as f:
+            text = f.read()
+
+    for model in models:
+        if is_in_past_evaluations(model, input_dataset, stride, _max_length):
+            cumulative_log += f"`{model}` has already been tested. Ignoring.\n\n"
+            yield cumulative_log
+            continue
+
+        if model != 'current model':
+            try:
+                yield cumulative_log + f"Loading `{model}`...\n\n"
+                model_settings = get_model_metadata(model)
+                shared.settings.update({k: v for k, v in model_settings.items() if k in shared.settings})  # hijacking the interface defaults
+                update_model_parameters(model_settings)  # hijacking the command-line arguments
+                unload_model()
+                shared.model, shared.tokenizer = load_model(model)
+            except:
+                cumulative_log += f"Failed to load `{model}`. Moving on.\n\n"
+                yield cumulative_log
+                continue
+
+        cumulative_log += f"Processing `{shared.model_name}`...\n\n"
+        yield cumulative_log + "Tokenizing the input dataset...\n\n"
+        encodings = encode(text, add_special_tokens=False)
+        seq_len = encodings.shape[1]
+        if _max_length:
+            max_length = _max_length
+        elif hasattr(shared.model.config, 'max_position_embeddings'):
+            max_length = shared.model.config.max_position_embeddings
+        else:
+            max_length = 2048
+
+        nlls = []
+        prev_end_loc = 0
+        for begin_loc in tqdm(range(0, seq_len, stride)):
+            yield cumulative_log + f"Evaluating... {100*begin_loc/seq_len:.2f}%"
+            end_loc = min(begin_loc + max_length, seq_len)
+            trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
+            input_ids = encodings[:, begin_loc:end_loc]
+            target_ids = input_ids.clone()
+            target_ids[:, :-trg_len] = -100
+            clear_torch_cache()
+            with torch.no_grad():
+                outputs = shared.model(input_ids=input_ids, labels=target_ids)
+
+                # loss is calculated using CrossEntropyLoss which averages over valid labels
+                # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
+                # to the left by 1.
+                neg_log_likelihood = outputs.loss
+
+            nlls.append(neg_log_likelihood)
+            prev_end_loc = end_loc
+            if end_loc == seq_len:
+                break
+
+        ppl = torch.exp(torch.stack(nlls).mean())
+        add_entry_to_past_evaluations(float(ppl), shared.model_name, input_dataset, stride, _max_length)
+        save_past_evaluations(past_evaluations)
+        cumulative_log += f"The perplexity for `{shared.model_name}` is: {float(ppl)}\n\n"
+        yield cumulative_log
+
+
+def add_entry_to_past_evaluations(perplexity, model, dataset, stride, max_length):
+    global past_evaluations
+    entry = {
+        'Model': model,
+        'LoRAs': ', '.join(shared.lora_names) or '-',
+        'Dataset': dataset,
+        'Perplexity': perplexity,
+        'stride': str(stride),
+        'max_length': str(max_length),
+        'Date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        'Comment': ''
+    }
+    past_evaluations = pd.concat([past_evaluations, pd.DataFrame([entry])], ignore_index=True)
+
+
+def is_in_past_evaluations(model, dataset, stride, max_length):
+    entries = past_evaluations[(past_evaluations['Model'] == model) &
+                               (past_evaluations['Dataset'] == dataset) &
+                               (past_evaluations['max_length'] == str(max_length)) &
+                               (past_evaluations['stride'] == str(stride))]
+
+    if entries.shape[0] > 0:
+        return True
+    else:
+        return False
+
+
+def generate_markdown_table():
+    sorted_df = past_evaluations.sort_values(by=['Dataset', 'stride', 'Perplexity', 'Date'])
+    return sorted_df
diff --git a/modules/exllamav2.py b/modules/exllamav2.py
new file mode 100644
index 0000000000000000000000000000000000000000..551ed4988c918852f3f35e13dc2eadbe3b5c8455
--- /dev/null
+++ b/modules/exllamav2.py
@@ -0,0 +1,149 @@
+import traceback
+from pathlib import Path
+
+import torch
+from exllamav2 import (
+    ExLlamaV2,
+    ExLlamaV2Cache,
+    ExLlamaV2Cache_8bit,
+    ExLlamaV2Config,
+    ExLlamaV2Tokenizer
+)
+from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator
+
+from modules import shared
+from modules.logging_colors import logger
+from modules.text_generation import get_max_prompt_length
+
+try:
+    import flash_attn
+except ModuleNotFoundError:
+    logger.warning(
+        'You are running ExLlamaV2 without flash-attention. This will cause the VRAM usage '
+        'to be a lot higher than it could be.\n'
+        'Try installing flash-attention following the instructions here: '
+        'https://github.com/Dao-AILab/flash-attention#installation-and-features'
+    )
+    pass
+except Exception:
+    logger.warning('Failed to load flash-attention due to the following error:\n')
+    traceback.print_exc()
+
+
+class Exllamav2Model:
+    def __init__(self):
+        pass
+
+    @classmethod
+    def from_pretrained(self, path_to_model):
+
+        path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model)
+
+        config = ExLlamaV2Config()
+        config.model_dir = str(path_to_model)
+        config.prepare()
+
+        config.max_seq_len = shared.args.max_seq_len
+        config.scale_pos_emb = shared.args.compress_pos_emb
+        config.scale_alpha_value = shared.args.alpha_value
+        config.no_flash_attn = shared.args.no_flash_attn
+        config.num_experts_per_token = int(shared.args.num_experts_per_token)
+
+        model = ExLlamaV2(config)
+
+        split = None
+        if shared.args.gpu_split:
+            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
+
+        model.load(split)
+
+        tokenizer = ExLlamaV2Tokenizer(config)
+        if shared.args.cache_8bit:
+            cache = ExLlamaV2Cache_8bit(model)
+        else:
+            cache = ExLlamaV2Cache(model)
+
+        generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
+
+        result = self()
+        result.model = model
+        result.cache = cache
+        result.tokenizer = tokenizer
+        result.generator = generator
+        result.loras = None
+        return result, result
+
+    def encode(self, string, **kwargs):
+        return self.tokenizer.encode(string, add_bos=True, encode_special_tokens=True)
+
+    def decode(self, ids, **kwargs):
+        if isinstance(ids, list):
+            ids = torch.tensor([ids])
+        elif isinstance(ids, torch.Tensor) and ids.numel() == 1:
+            ids = ids.view(1, -1)
+
+        return self.tokenizer.decode(ids, decode_special_tokens=True)[0]
+
+    def get_logits(self, token_ids, **kwargs):
+        self.cache.current_seq_len = 0
+        if token_ids.shape[-1] > 1:
+            self.model.forward(token_ids[:, :-1], self.cache, input_mask=None, preprocess_only=True, loras=self.loras)
+
+        return self.model.forward(token_ids[:, -1:], self.cache, input_mask=None, loras=self.loras, **kwargs).float().cpu()
+
+    def generate_with_streaming(self, prompt, state):
+        settings = ExLlamaV2Sampler.Settings()
+
+        settings.token_repetition_penalty = state['repetition_penalty']
+        settings.token_repetition_range = -1 if state['repetition_penalty_range'] <= 0 else state['repetition_penalty_range']
+
+        settings.token_frequency_penalty = state['frequency_penalty']
+        settings.token_presence_penalty = state['presence_penalty']
+
+        settings.temperature = state['temperature']
+        settings.top_k = state['top_k']
+        settings.top_p = state['top_p']
+        settings.top_a = state['top_a']
+        settings.min_p = state['min_p']
+        settings.tfs = state['tfs']
+        settings.typical = state['typical_p']
+
+        settings.temperature_last = state['temperature_last']
+
+        settings.mirostat = state['mirostat_mode'] == 2
+        settings.mirostat_tau = state['mirostat_tau']
+        settings.mirostat_eta = state['mirostat_eta']
+
+        if state['ban_eos_token']:
+            settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
+
+        if state['custom_token_bans']:
+            to_ban = [int(x) for x in state['custom_token_bans'].split(',')]
+            if len(to_ban) > 0:
+                settings.disallow_tokens(self.tokenizer, to_ban)
+
+        ids = self.tokenizer.encode(prompt, add_bos=state['add_bos_token'], encode_special_tokens=True)
+        ids = ids[:, -get_max_prompt_length(state):]
+
+        if state['auto_max_new_tokens']:
+            max_new_tokens = state['truncation_length'] - ids.shape[-1]
+        else:
+            max_new_tokens = state['max_new_tokens']
+
+        self.generator.begin_stream(ids, settings, loras=self.loras)
+
+        decoded_text = ''
+        for i in range(max_new_tokens):
+            chunk, eos, _ = self.generator.stream()
+            if eos or shared.stop_everything:
+                break
+
+            decoded_text += chunk
+            yield decoded_text
+
+    def generate(self, prompt, state):
+        output = ''
+        for output in self.generate_with_streaming(prompt, state):
+            pass
+
+        return output
diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py
new file mode 100644
index 0000000000000000000000000000000000000000..944c39dd29557f8304c628c67be18182a829912f
--- /dev/null
+++ b/modules/exllamav2_hf.py
@@ -0,0 +1,170 @@
+import os
+import traceback
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import torch
+from exllamav2 import (
+    ExLlamaV2,
+    ExLlamaV2Cache,
+    ExLlamaV2Cache_8bit,
+    ExLlamaV2Config
+)
+from torch.nn import CrossEntropyLoss
+from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+from modules import shared
+from modules.logging_colors import logger
+
+try:
+    import flash_attn
+except ModuleNotFoundError:
+    logger.warning(
+        'You are running ExLlamaV2 without flash-attention. This will cause the VRAM usage '
+        'to be a lot higher than it could be.\n'
+        'Try installing flash-attention following the instructions here: '
+        'https://github.com/Dao-AILab/flash-attention#installation-and-features'
+    )
+    pass
+except Exception:
+    logger.warning('Failed to load flash-attention due to the following error:\n')
+    traceback.print_exc()
+
+
+class Exllamav2HF(PreTrainedModel):
+    def __init__(self, config: ExLlamaV2Config):
+        super().__init__(PretrainedConfig())
+        self.ex_config = config
+        self.ex_model = ExLlamaV2(config)
+        split = None
+        if shared.args.gpu_split:
+            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]
+
+        self.ex_model.load(split)
+        self.generation_config = GenerationConfig()
+        self.loras = None
+
+        if shared.args.cache_8bit:
+            self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model)
+        else:
+            self.ex_cache = ExLlamaV2Cache(self.ex_model)
+
+        self.past_seq = None
+        if shared.args.cfg_cache:
+            if shared.args.cache_8bit:
+                self.ex_cache_negative = ExLlamaV2Cache_8bit(self.ex_model)
+            else:
+                self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)
+
+            self.past_seq_negative = None
+
+    def _validate_model_class(self):
+        pass
+
+    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
+        pass
+
+    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+        return {'input_ids': input_ids, **kwargs}
+
+    @property
+    def device(self) -> torch.device:
+        return torch.device(0)
+
+    def __call__(self, *args, **kwargs):
+        use_cache = kwargs.get('use_cache', True)
+        labels = kwargs.get('labels', None)
+        past_key_values = kwargs.get('past_key_values', None)
+
+        if len(args) > 0:
+            if not shared.args.cfg_cache:
+                logger.error("Please enable the cfg-cache option to use CFG with ExLlamav2_HF.")
+                return
+
+            input_ids = args[0]
+            is_negative = True
+            past_seq = self.past_seq_negative
+            ex_cache = self.ex_cache_negative
+        else:
+            input_ids = kwargs['input_ids']
+            is_negative = False
+            past_seq = self.past_seq
+            ex_cache = self.ex_cache
+
+        seq = input_ids[0].tolist()
+        if is_negative and past_key_values is not None:
+            seq = past_key_values + seq
+
+        seq_tensor = torch.tensor(seq)
+        reset = True
+
+        # Make the forward call
+        if labels is None:
+            if past_seq is not None:
+                min_length = min(past_seq.shape[0], seq_tensor.shape[0])
+                indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
+                if len(indices) > 0:
+                    longest_prefix = indices[0].item()
+                else:
+                    longest_prefix = min_length
+
+                if longest_prefix > 0:
+                    reset = False
+                    ex_cache.current_seq_len = longest_prefix
+                    if len(seq_tensor) - longest_prefix > 1:
+                        self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True, loras=self.loras)
+                    elif len(seq_tensor) == longest_prefix:
+                        # Very tricky: if the prefix we are reusing *is* the input_ids, then we have to back up the cache pointer by one,
+                        # because we feed input_ids[-1] to forward() below, but that last token is already in the cache!
+                        ex_cache.current_seq_len -= 1
+
+            if reset:
+                ex_cache.current_seq_len = 0
+                if len(seq_tensor) > 1:
+                    self.ex_model.forward(seq_tensor[:-1].view(1, -1), ex_cache, preprocess_only=True, loras=self.loras)
+
+            logits = self.ex_model.forward(seq_tensor[-1:].view(1, -1), ex_cache, loras=self.loras).to(input_ids.device).float()
+        else:
+            ex_cache.current_seq_len = 0
+            logits = self.ex_model.forward(seq_tensor.view(1, -1), ex_cache, last_id_only=False, loras=self.loras).float()
+
+        if is_negative:
+            self.past_seq_negative = seq_tensor
+        else:
+            self.past_seq = seq_tensor
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, logits.shape[-1])
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        return CausalLMOutputWithPast(logits=logits, past_key_values=seq if use_cache else None, loss=loss)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
+        assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported"
+        if isinstance(pretrained_model_name_or_path, str):
+            pretrained_model_name_or_path = Path(pretrained_model_name_or_path)
+
+        pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)
+
+        config = ExLlamaV2Config()
+        config.model_dir = str(pretrained_model_name_or_path)
+        config.prepare()
+
+        config.max_seq_len = shared.args.max_seq_len
+        config.scale_pos_emb = shared.args.compress_pos_emb
+        config.scale_alpha_value = shared.args.alpha_value
+        config.no_flash_attn = shared.args.no_flash_attn
+        config.num_experts_per_token = int(shared.args.num_experts_per_token)
+
+        return Exllamav2HF(config)
diff --git a/modules/extensions.py b/modules/extensions.py
new file mode 100644
index 0000000000000000000000000000000000000000..0af5e5fe0d97e6e62f49c50d73478c0c7cc5f625
--- /dev/null
+++ b/modules/extensions.py
@@ -0,0 +1,232 @@
+import traceback
+from functools import partial
+from inspect import signature
+
+import gradio as gr
+
+import extensions
+import modules.shared as shared
+from modules.logging_colors import logger
+
+state = {}
+available_extensions = []
+setup_called = set()
+
+
+def apply_settings(extension, name):
+    if not hasattr(extension, 'params'):
+        return
+
+    for param in extension.params:
+        _id = f"{name}-{param}"
+        shared.default_settings[_id] = extension.params[param]
+        if _id in shared.settings:
+            extension.params[param] = shared.settings[_id]
+
+
+def load_extensions():
+    global state, setup_called
+    state = {}
+    for i, name in enumerate(shared.args.extensions):
+        if name in available_extensions:
+            if name != 'api':
+                logger.info(f'Loading the extension "{name}"')
+            try:
+                try:
+                    exec(f"import extensions.{name}.script")
+                except ModuleNotFoundError:
+                    logger.error(f"Could not import the requirements for '{name}'. Make sure to install the requirements for the extension.\n\nLinux / Mac:\n\npip install -r extensions/{name}/requirements.txt --upgrade\n\nWindows:\n\npip install -r extensions\\{name}\\requirements.txt --upgrade\n\nIf you used the one-click installer, paste the command above in the terminal window opened after launching the cmd script for your OS.")
+                    raise
+
+                extension = getattr(extensions, name).script
+
+                # Only run setup() and apply settings from settings.yaml once
+                if extension not in setup_called:
+                    apply_settings(extension, name)
+                    if hasattr(extension, "setup"):
+                        extension.setup()
+
+                    setup_called.add(extension)
+
+                state[name] = [True, i]
+            except:
+                logger.error(f'Failed to load the extension "{name}".')
+                traceback.print_exc()
+
+
+# This iterator returns the extensions in the order specified in the command-line
+def iterator():
+    for name in sorted(state, key=lambda x: state[x][1]):
+        if state[name][0]:
+            yield getattr(extensions, name).script, name
+
+
+# Extension functions that map string -> string
+def _apply_string_extensions(function_name, text, state, is_chat=False):
+    for extension, _ in iterator():
+        if hasattr(extension, function_name):
+            func = getattr(extension, function_name)
+
+            # Handle old extensions without the 'state' arg or
+            # the 'is_chat' kwarg
+            count = 0
+            has_chat = False
+            for k in signature(func).parameters:
+                if k == 'is_chat':
+                    has_chat = True
+                else:
+                    count += 1
+
+            if count == 2:
+                args = [text, state]
+            else:
+                args = [text]
+
+            if has_chat:
+                kwargs = {'is_chat': is_chat}
+            else:
+                kwargs = {}
+
+            text = func(*args, **kwargs)
+
+    return text
+
+
+# Extension functions that map string -> string
+def _apply_chat_input_extensions(text, visible_text, state):
+    for extension, _ in iterator():
+        if hasattr(extension, 'chat_input_modifier'):
+            text, visible_text = extension.chat_input_modifier(text, visible_text, state)
+
+    return text, visible_text
+
+
+# custom_generate_chat_prompt handling - currently only the first one will work
+def _apply_custom_generate_chat_prompt(text, state, **kwargs):
+    for extension, _ in iterator():
+        if hasattr(extension, 'custom_generate_chat_prompt'):
+            return extension.custom_generate_chat_prompt(text, state, **kwargs)
+
+    return None
+
+
+# Extension that modifies the input parameters before they are used
+def _apply_state_modifier_extensions(state):
+    for extension, _ in iterator():
+        if hasattr(extension, "state_modifier"):
+            state = getattr(extension, "state_modifier")(state)
+
+    return state
+
+
+# Extension that modifies the chat history before it is used
+def _apply_history_modifier_extensions(history):
+    for extension, _ in iterator():
+        if hasattr(extension, "history_modifier"):
+            history = getattr(extension, "history_modifier")(history)
+
+    return history
+
+
+# Extension functions that override the default tokenizer output - The order of execution is not defined
+def _apply_tokenizer_extensions(function_name, state, prompt, input_ids, input_embeds):
+    for extension, _ in iterator():
+        if hasattr(extension, function_name):
+            prompt, input_ids, input_embeds = getattr(extension, function_name)(state, prompt, input_ids, input_embeds)
+
+    return prompt, input_ids, input_embeds
+
+
+# Allow extensions to add their own logits processors to the stack being run.
+# Each extension would call `processor_list.append({their LogitsProcessor}())`.
+def _apply_logits_processor_extensions(function_name, processor_list, input_ids):
+    for extension, _ in iterator():
+        if hasattr(extension, function_name):
+            result = getattr(extension, function_name)(processor_list, input_ids)
+            if type(result) is list:
+                processor_list = result
+
+    return processor_list
+
+
+# Get prompt length in tokens after applying extension functions which override the default tokenizer output
+# currently only the first one will work
+def _apply_custom_tokenized_length(prompt):
+    for extension, _ in iterator():
+        if hasattr(extension, 'custom_tokenized_length'):
+            return getattr(extension, 'custom_tokenized_length')(prompt)
+
+    return None
+
+
+# Custom generate reply handling - currently only the first one will work
+def _apply_custom_generate_reply():
+    for extension, _ in iterator():
+        if hasattr(extension, 'custom_generate_reply'):
+            return getattr(extension, 'custom_generate_reply')
+
+    return None
+
+
+def _apply_custom_css():
+    all_css = ''
+    for extension, _ in iterator():
+        if hasattr(extension, 'custom_css'):
+            all_css += getattr(extension, 'custom_css')()
+
+    return all_css
+
+
+def _apply_custom_js():
+    all_js = ''
+    for extension, _ in iterator():
+        if hasattr(extension, 'custom_js'):
+            all_js += getattr(extension, 'custom_js')()
+
+    return all_js
+
+
+def create_extensions_block():
+    to_display = []
+    for extension, name in iterator():
+        if hasattr(extension, "ui") and not (hasattr(extension, 'params') and extension.params.get('is_tab', False)):
+            to_display.append((extension, name))
+
+    # Creating the extension ui elements
+    if len(to_display) > 0:
+        with gr.Column(elem_id="extensions"):
+            for row in to_display:
+                extension, _ = row
+                extension.ui()
+
+
+def create_extensions_tabs():
+    for extension, name in iterator():
+        if hasattr(extension, "ui") and (hasattr(extension, 'params') and extension.params.get('is_tab', False)):
+            display_name = getattr(extension, 'params', {}).get('display_name', name)
+            with gr.Tab(display_name, elem_classes="extension-tab"):
+                extension.ui()
+
+
+EXTENSION_MAP = {
+    "input": partial(_apply_string_extensions, "input_modifier"),
+    "output": partial(_apply_string_extensions, "output_modifier"),
+    "chat_input": _apply_chat_input_extensions,
+    "state": _apply_state_modifier_extensions,
+    "history": _apply_history_modifier_extensions,
+    "bot_prefix": partial(_apply_string_extensions, "bot_prefix_modifier"),
+    "tokenizer": partial(_apply_tokenizer_extensions, "tokenizer_modifier"),
+    'logits_processor': partial(_apply_logits_processor_extensions, 'logits_processor_modifier'),
+    "custom_generate_chat_prompt": _apply_custom_generate_chat_prompt,
+    "custom_generate_reply": _apply_custom_generate_reply,
+    "tokenized_length": _apply_custom_tokenized_length,
+    "css": _apply_custom_css,
+    "js": _apply_custom_js
+}
+
+
+def apply_extensions(typ, *args, **kwargs):
+    if typ not in EXTENSION_MAP:
+        raise ValueError(f"Invalid extension type {typ}")
+
+    return EXTENSION_MAP[typ](*args, **kwargs)
diff --git a/modules/github.py b/modules/github.py
new file mode 100644
index 0000000000000000000000000000000000000000..282267b6be7f3b0371a3fd332f98e38611c9fb9a
--- /dev/null
+++ b/modules/github.py
@@ -0,0 +1,38 @@
+import subprocess
+from pathlib import Path
+
+new_extensions = set()
+
+
+def clone_or_pull_repository(github_url):
+    global new_extensions
+
+    repository_folder = Path("extensions")
+    repo_name = github_url.rstrip("/").split("/")[-1].split(".")[0]
+
+    # Check if the repository folder exists
+    if not repository_folder.exists():
+        repository_folder.mkdir(parents=True)
+
+    repo_path = repository_folder / repo_name
+
+    # Check if the repository is already cloned
+    if repo_path.exists():
+        yield f"Updating {github_url}..."
+        # Perform a 'git pull' to update the repository
+        try:
+            pull_output = subprocess.check_output(["git", "-C", repo_path, "pull"], stderr=subprocess.STDOUT)
+            yield "Done."
+            return pull_output.decode()
+        except subprocess.CalledProcessError as e:
+            return str(e)
+
+    # Clone the repository
+    try:
+        yield f"Cloning {github_url}..."
+        clone_output = subprocess.check_output(["git", "clone", github_url, repo_path], stderr=subprocess.STDOUT)
+        new_extensions.add(repo_name)
+        yield f"The extension `{repo_name}` has been downloaded.\n\nPlease close the the web UI completely and launch it again to be able to load it."
+        return clone_output.decode()
+    except subprocess.CalledProcessError as e:
+        return str(e)
diff --git a/modules/grammar/__pycache__/grammar_utils.cpython-311.pyc b/modules/grammar/__pycache__/grammar_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..787f3f9edaba19fe8fb7af1d5c19ee38ff3964ff
Binary files /dev/null and b/modules/grammar/__pycache__/grammar_utils.cpython-311.pyc differ
diff --git a/modules/grammar/__pycache__/logits_process.cpython-311.pyc b/modules/grammar/__pycache__/logits_process.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba55bccc85fd271119dc65ec55bd89ea31a7120a
Binary files /dev/null and b/modules/grammar/__pycache__/logits_process.cpython-311.pyc differ
diff --git a/modules/grammar/grammar_utils.py b/modules/grammar/grammar_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b37c24a7ac1e3b41a91585c956b17b6e07c6328
--- /dev/null
+++ b/modules/grammar/grammar_utils.py
@@ -0,0 +1,687 @@
+'''
+This file has been 100% copied from this PR to the Transformers library:
+https://github.com/huggingface/transformers/pull/27557
+
+Author: Saibo-creator
+Author GitHub: https://github.com/Saibo-creator
+
+All credits go to the author.
+'''
+
+import logging
+import re
+import time
+from abc import ABC
+from functools import lru_cache
+from typing import Dict, List
+
+import torch
+
+from modules import shared
+
+logger = logging.getLogger(__name__)
+
+
+########################
+# EBNF Grammar Parsing #
+########################
+
+END_OF_ALTERNATE_MARKER = 0
+END_OF_RULE_MARKER = 0
+TO_BE_FILLED_MARKER = 0
+REF_RULE_MARKER = 1
+LITERAL_MARKER = 2
+
+
+class ParseState:
+    def __init__(self):
+        self.symbol_ids = {}
+        self.grammar_encoding = []  # old name: out_grammar
+
+
+def get_symbol_id(state, src):
+    if src not in state.symbol_ids:
+        state.symbol_ids[src] = len(state.symbol_ids)
+    return state.symbol_ids[src]
+
+
+def generate_symbol_id(state, base_name):
+    next_id = len(state.symbol_ids)
+    state.symbol_ids[base_name + "_" + str(next_id)] = next_id
+    return next_id
+
+
+def is_word_char(c):
+    return c.isalnum() or c == "-" or c == "_"
+
+
+def hex_to_int(c):
+    if c.isdigit():
+        return int(c)
+    elif "a" <= c.lower() <= "f":
+        return ord(c.lower()) - ord("a") + 10
+    return -1
+
+
+def remove_leading_white_space(src, newline_ok):
+    """
+    Skips over whitespace and comments in the input string.
+    This function processes the input string, skipping over any spaces, tabs,
+    and content following a '#' character, which denotes a comment. The parsing
+    of a comment continues until the end of the line (denoted by newline characters
+    '\r' or '\n'). If the 'newline_ok' parameter is set to False, the function
+    will stop processing and return the remaining string upon encountering a
+    newline character, otherwise it will skip over newline characters as well.
+    Parameters:
+    src (str): The input string to be processed.
+    newline_ok (bool): A flag indicating whether encountering a newline character
+                       should stop the parsing (False) or if it should be skipped (True).
+    Returns:
+    str: The remaining portion of the input string after skipping whitespace and comments.
+    """
+    pos = 0
+    while pos < len(src) and (src[pos].isspace() or src[pos] == "#"):
+        if src[pos] == "#":
+            while pos < len(src) and src[pos] not in ("\r", "\n"):
+                pos += 1
+        else:
+            if not newline_ok and src[pos] in ("\r", "\n"):
+                break
+            pos += 1
+    return src[pos:]
+
+
+def parse_name(src):
+    pos = 0
+    while pos < len(src) and is_word_char(src[pos]):
+        pos += 1
+    if pos == 0:
+        raise RuntimeError("expecting name at " + src)
+    return src[:pos], src[pos:]
+
+
+def parse_char(src):
+    """
+    parse the leading char from the input string
+    :param src:
+    :return: char, remaining_src
+    """
+
+    # if we have a backslash, it's maybe an escape
+    if src[0] == "\\":
+        esc = src[1]
+        if esc == "x":
+            first = hex_to_int(src[2])
+            if first > -1:
+                second = hex_to_int(src[3])
+                if second > -1:
+                    return (first << 4) + second, src[4:]
+            raise RuntimeError("expecting \\xNN at " + src)
+        elif esc in ('"', "[", "]"):
+            return esc, src[2:]
+        elif esc == "r":
+            return "\r", src[2:]
+        elif esc == "n":
+            return "\n", src[2:]
+        elif esc == "t":
+            return "\t", src[2:]
+        raise RuntimeError("unknown escape at " + src)
+    elif src:
+        return src[0], src[1:]
+    raise RuntimeError("unexpected end of input")
+
+
+def parse_sequence(state, src, rule_name, outbuf, is_nested):
+    out_start_pos = len(outbuf)
+
+    # sequence size, will be replaced at end when known
+    outbuf.append(TO_BE_FILLED_MARKER)
+
+    last_sym_start = len(outbuf)
+    remaining_src = src
+    while remaining_src:
+        if remaining_src[0] == '"':  # literal string
+            remaining_src = remaining_src[1:]
+            last_sym_start = len(outbuf)
+            while remaining_src[0] != '"':
+                char, remaining_src = parse_char(remaining_src)
+
+                # each char of a literal is encoded as a "range" of char - char
+                outbuf.append(LITERAL_MARKER)
+                outbuf.append(ord(char))
+                outbuf.append(ord(char))
+            remaining_src = remove_leading_white_space(remaining_src[1:], is_nested)
+        elif remaining_src[0] == "[":  # char range(s)
+            remaining_src = remaining_src[1:]
+            last_sym_start = len(outbuf)
+            # num chars in range - replaced at end of loop
+            outbuf.append(TO_BE_FILLED_MARKER)
+            while remaining_src[0] != "]":
+                char, remaining_src = parse_char(remaining_src)
+
+                outbuf.append(ord(char))
+                if remaining_src[0] == "-" and remaining_src[1] != "]":
+                    endchar_pair, remaining_src = parse_char(remaining_src[1:])
+                    outbuf.append(ord(endchar_pair))
+                else:
+                    # chars that aren't part of a c1-c2 range are just doubled (i.e., c-c)
+                    outbuf.append(ord(char))
+            # replace num chars with actual
+            outbuf[last_sym_start] = len(outbuf) - last_sym_start - 1
+            remaining_src = remove_leading_white_space(remaining_src[1:], is_nested)
+        elif is_word_char(remaining_src[0]):  # rule reference
+            name, remaining_src = parse_name(remaining_src)
+            ref_rule_id = get_symbol_id(state, name)
+            remaining_src = remove_leading_white_space(remaining_src, is_nested)
+            last_sym_start = len(outbuf)
+            outbuf.append(REF_RULE_MARKER)
+            outbuf.append(ref_rule_id)
+        elif remaining_src[0] == "(":  # grouping
+            # parse nested alternates into synthesized rule
+            remaining_src = remove_leading_white_space(remaining_src[1:], True)
+            sub_rule_id = generate_symbol_id(state, rule_name)
+            remaining_src = parse_alternates(state, remaining_src, rule_name, sub_rule_id, True)
+            last_sym_start = len(outbuf)
+            # output reference to synthesized rule
+            outbuf.append(REF_RULE_MARKER)
+            outbuf.append(sub_rule_id)
+            if remaining_src[0] != ")":
+                raise RuntimeError("expecting ')' at " + remaining_src)
+            remaining_src = remove_leading_white_space(remaining_src[1:], is_nested)
+        elif remaining_src[0] in ("*", "+", "?"):  # repetition operator
+            if len(outbuf) - out_start_pos - 1 == 0:
+                raise RuntimeError("expecting preceeding item to */+/? at " + remaining_src)
+            out_grammar = state.grammar_encoding
+
+            # apply transformation to previous symbol (last_sym_start -
+            # end) according to rewrite rules:
+            # S* --> S' ::= S S' |
+            # S+ --> S' ::= S S' | S
+            # S? --> S' ::= S |
+            sub_rule_id = generate_symbol_id(state, rule_name)
+            out_grammar.append(sub_rule_id)
+            sub_rule_start = len(out_grammar)
+            # placeholder for size of 1st alternate
+            out_grammar.append(TO_BE_FILLED_MARKER)
+            # add preceding symbol to generated rule
+            out_grammar.extend(outbuf[last_sym_start:])
+            if remaining_src[0] in ("*", "+"):
+                # cause generated rule to recurse
+                out_grammar.append(REF_RULE_MARKER)
+                out_grammar.append(sub_rule_id)
+            # apply actual size
+            out_grammar[sub_rule_start] = len(out_grammar) - sub_rule_start
+            # mark end of 1st alternate
+            out_grammar.append(END_OF_ALTERNATE_MARKER)
+            sub_rule_start = len(out_grammar)
+            # placeholder for size of 2nd alternate
+            out_grammar.append(TO_BE_FILLED_MARKER)
+            if remaining_src[0] == "+":
+                # add preceding symbol as alternate only for '+'
+                out_grammar.extend(outbuf[last_sym_start:])
+            # apply actual size of 2nd alternate
+            out_grammar[sub_rule_start] = len(out_grammar) - sub_rule_start
+            # mark end of 2nd alternate, then end of rule
+            out_grammar.append(END_OF_ALTERNATE_MARKER)
+            out_grammar.append(END_OF_RULE_MARKER)
+
+            # in original rule, replace previous symbol with reference to generated rule
+            outbuf[last_sym_start:] = [1, sub_rule_id]
+
+            remaining_src = remove_leading_white_space(remaining_src[1:], is_nested)
+        else:
+            break
+    # apply actual size of this alternate sequence
+    outbuf[out_start_pos] = len(outbuf) - out_start_pos
+    # mark end of alternate
+    outbuf.append(END_OF_ALTERNATE_MARKER)
+    return remaining_src
+
+
+def parse_alternates(state, src, rule_name, rule_id, is_nested):
+    outbuf = []
+    remaining_src = parse_sequence(state, src, rule_name, outbuf, is_nested)
+    while remaining_src and remaining_src[0] == "|":
+        remaining_src = remove_leading_white_space(remaining_src[1:], True)
+        remaining_src = parse_sequence(state, remaining_src, rule_name, outbuf, is_nested)
+
+    state.grammar_encoding.append(rule_id)
+    state.grammar_encoding.extend(outbuf)
+    state.grammar_encoding.append(0)
+    return remaining_src
+
+
+def parse_rule(state, src):
+    name, remaining_src = parse_name(src)
+    remaining_src = remove_leading_white_space(remaining_src, False)
+    rule_id = get_symbol_id(state, name)
+
+    if remaining_src[:3] != "::=":
+        raise RuntimeError("expecting ::= at " + remaining_src)
+    remaining_src = remove_leading_white_space(remaining_src[3:], True)
+
+    remaining_src = parse_alternates(state, remaining_src, name, rule_id, False)
+
+    if remaining_src and remaining_src[0] == "\r":
+        remaining_src = remaining_src[2:] if remaining_src[1] == "\n" else remaining_src[1:]
+    elif remaining_src and remaining_src[0] == "\n":
+        remaining_src = remaining_src[1:]
+    elif remaining_src:
+        raise RuntimeError("expecting newline or end at " + remaining_src)
+    return remove_leading_white_space(remaining_src, True)
+
+
+def parse_ebnf(src):
+    try:
+        state = ParseState()
+        grammar_repr = remove_leading_white_space(src, True)
+        last_grammar_repr = ""
+        while grammar_repr:
+            if last_grammar_repr:
+                last_parsed_rule_len = len(last_grammar_repr) - len(grammar_repr)
+                logger.debug(f"last_parsed_rule: {last_grammar_repr[:last_parsed_rule_len]}")
+            last_grammar_repr = grammar_repr
+            grammar_repr = parse_rule(state, grammar_repr)
+        state.grammar_encoding.append(0xFFFF)
+        return state
+    except RuntimeError as err:
+        logger.warning("error parsing grammar:", err)
+        return ParseState()
+
+
+def print_rule(file, grammar_encoding, index, symbol_id_names):
+    rule_id = grammar_encoding[index]
+    print(f"<{index}>{symbol_id_names[rule_id]} ::=", end=" ", file=file)
+    pos = index + 1
+    while grammar_encoding[pos]:
+        if pos - 1 > index:
+            print("|", end=" ", file=file)
+        pos += 1  # sequence size, not needed here
+        while grammar_encoding[pos]:
+            if grammar_encoding[pos] == REF_RULE_MARKER:
+                ref_rule_id = grammar_encoding[pos + 1]
+                print(
+                    f"<{pos}>{symbol_id_names[ref_rule_id]}",
+                    end=" ",
+                    file=file,
+                )
+                pos += 2
+            else:
+                print("<{}>[".format(pos), end="", file=file)
+                num_chars = grammar_encoding[pos]
+                pos += 1
+
+                for i in range(0, num_chars, 2):
+                    print("{}-".format(chr(grammar_encoding[pos + i])), end="", file=file)
+                    if i + 1 < num_chars:
+                        print("{}".format(chr(grammar_encoding[pos + i + 1])), end="", file=file)
+                print("]", end=" ", file=file)
+                pos += num_chars
+        pos += 1
+    print(file=file)
+    return pos + 1
+
+
+def print_grammar(file, state):
+    pos = 0
+    symbol_id_names = {v: k for k, v in state.symbol_ids.items()}
+    print("Grammar Rules:", file=file)
+
+    while state.grammar_encoding[pos] != 0xFFFF:
+        pos = print_rule(file, state.grammar_encoding, pos, symbol_id_names)
+    pos = 0
+    print("\nBinary representation:", file=file)
+    while state.grammar_encoding[pos] != 0xFFFF:
+        print(f"{state.grammar_encoding[pos]:04x}", end=" ", file=file)
+        pos += 1
+    print("ffff\n")
+
+
+###################################
+# EBNF Grammar Parsing ends here  #
+###################################
+
+
+class GrammarConstraint(ABC):
+    def __init__(self, grammar_str, start_rule_name, tokenizer):
+        self.tt = 0
+        self.nt = 0
+        state = parse_ebnf(grammar_str)
+        grammar_encoding = state.grammar_encoding
+        self.start_rule_id = state.symbol_ids.get(start_rule_name)
+
+        self.eos_token_id = tokenizer.eos_token_id
+        self.token_trie = TokenTrie(tokenizer)
+        self.tokenizer = tokenizer
+        self.grammar_encoding = grammar_encoding
+
+        pos = 0
+        rules: Dict[int, int] = {}
+
+        while grammar_encoding[pos] != 0xFFFF:
+            rule_id = grammar_encoding[pos]
+
+            # Store the current position in the 'rules' list at the index corresponding to rule_id.
+            # This effectively maps each rule_id to its position in the grammar encoding.
+            rules[rule_id] = pos
+            pos += 1
+
+            # Continue to the next rule in the encoding.
+            # The loop advances by the size indicated at the current position (grammar_encoding[pos])
+            # plus one for the size field itself.
+            while grammar_encoding[pos]:
+                pos += 1 + grammar_encoding[pos]
+            # Now we're at the end of the rule,
+            # so advance to the next rule by skipping the 0, which means 'end of rule'.
+            pos += 1
+
+        self.start_rule_pos = rules[self.start_rule_id]
+        self.rules_pos_dict: Dict[int, int] = rules
+
+    def init_stacks(self):
+        # suppose the start rule position is 0, then grammar_encoding[0] = rule_id
+        # grammar_encoding[1] = rule_size
+        # grammar_encoding[2] = rule_type
+        # this is why we need to add 2 to the start rule position
+        stack = [self.start_rule_pos + 2]
+        # convert to tuple for caching(immutable)
+        return self.advance_stack(tuple(stack))
+
+    # For each stack, resolve rules to find the actual characters that are
+    # accepted by this stack (not the set of sub-rules).
+    # This is where the parsing happens.
+    # The parsing is a top-down, left-to-right, depth-first traversal of the
+    # grammar.
+    @lru_cache(maxsize=32768)
+    def advance_stack(self, stack):
+        stack = list(stack)
+        # If the stack is empty, we're done. Because no more tokens should be accepted.
+        if len(stack) == 0:
+            return [stack]
+
+        # Get the top of the stack.
+        pos = stack[-1]
+
+        # If the stack head is a terminal(literal), we can resolve it immediately.
+        # literal is marked with 2 in the grammar encoding.
+        if self.grammar_encoding[pos] > 1:
+            return [stack]
+
+        # The stack head is a nonterminal (a rule reference, 1 in the grammar encoding).
+        # Resolving this rule gives a set of one or more possible positions
+        # (e.g. two in `a ::= b | c`)
+        # We pop the current rule off the stack and, for each option, push:
+        # - the symbol following this symbol in the current rule; then
+        # - the first symbol of the resolved rule.
+        referenced_rule_id = self.grammar_encoding[pos + 1]
+
+        # subpos should points to the size of the subrule
+        subpos = self.rules_pos_dict[referenced_rule_id] + 1
+        stacks: List[List[int]] = []
+
+        # do depth-first search to find all possible rules and check the next terminal
+        # When this value is non-zero, it indicates that subpos is not yet at the end of the rule, so we can continue.
+        # here subpos is a pointer, and the value in the rule encoding can never be 0 except for the end of the rule.
+        while self.grammar_encoding[subpos]:
+            new_stack = stack[:-1]
+            if self.grammar_encoding[pos + 2]:
+                # check if there is a next symbol in the current rule, e.g. `a ::= b c | d`
+                # if yes, push the pos to rule_size to the stack
+                new_stack.append(pos + 2)
+
+            # if the type of the next symbol is not "empty", push the first symbol of the resolved rule to the stack
+            if self.grammar_encoding[subpos + 1]:
+                new_stack.append(subpos + 1)
+            stacks.extend(self.advance_stack(tuple(new_stack)))
+            # The increment subpos += self.grammar_encoding[subpos] + 1
+            # moves subpos forward in the grammar encoding array to the next alternative in the current rule.
+            subpos += self.grammar_encoding[subpos] + 1
+        return stacks
+
+    def accept_char(self, *args, **kwargs):
+        """Process a byte according to the grammar rules."""
+        raise NotImplementedError
+
+    def accept_token_id(self, *args, **kwargs):
+        """Process a token according to the grammar rules."""
+        raise NotImplementedError
+
+    def filter_vocab(self, *args, **kwargs):
+        raise NotImplementedError
+
+
+class IncrementalGrammarConstraint(GrammarConstraint):
+    def __init__(self, grammar_str, start_rule_name, tokenizer):
+        super().__init__(grammar_str, start_rule_name, tokenizer)
+
+    def accept_char(self, byte, stacks):
+        new_stacks = []
+        for stack in stacks:
+            # stack is empty
+            if not stack:
+                continue
+
+            pos = stack[-1]
+            num_chars = self.grammar_encoding[pos]
+
+            # to make pos point to the size of the char range rule
+            pos += 1
+            found = False
+            for i in range(0, num_chars, 2):
+                if self.grammar_encoding[pos + i] <= byte and byte <= self.grammar_encoding[pos + i + 1]:
+                    found = True
+                    break
+            if not found:
+                continue
+
+            pos += num_chars
+            new_stack = stack[:-1]
+            if self.grammar_encoding[pos]:
+                new_stack.append(pos)
+            new_stacks.extend(self.advance_stack(tuple(new_stack)))
+
+        return new_stacks
+
+    def accept_string(self, string: str, stacks: List[List[int]]):
+        _bytes = bytes(string, "utf-8")
+        for byte in _bytes:
+            stacks = self.accept_char(byte, stacks)
+        return stacks
+
+    def accept_token_id(self, token_id: int, stacks: List[List[int]]):
+        if token_id == self.eos_token_id:
+            if stacks and all(len(stack) != 0 for stack in stacks):
+                raise Exception(
+                    f"At least one of the stack should be empty when EOS is reached. However, "
+                    f"the stacks are {stacks}"
+                )
+            return []
+
+        for byte in self.token_trie.id2str(token_id):
+            stacks = self.accept_char(byte, stacks)
+            # check updated stacks
+            # TODO, I commented this out because it will fail when the stack is empty
+            # empty stack means the end of the grammar
+            # assert stacks != []
+
+        return stacks
+
+    def accept_token_ids(self, token_ids: List[int], stacks: List[List[int]], as_string=True):
+        if as_string:
+            string = self.tokenizer.decode(token_ids)
+            stacks = self.accept_string(string, stacks)
+        else:
+            for token_id in token_ids:
+                stacks = self.accept_token_id(token_id, stacks)
+        return stacks
+
+    def batch_filter_vocab(self, batch_stacks, device):
+        batch_acceptance = []
+        for stacks in batch_stacks:
+            batch_acceptance.append(self.filter_vocab(stacks, device))
+        return torch.stack(batch_acceptance)
+
+    def filter_vocab(self, stacks, device):
+        if not stacks:  # Check if stacks is empty
+            # Handle the empty case: for example, return a tensor of False
+            # The size of the tensor should match the size of your vocabulary
+            vocab_size = len(self.token_trie)
+            logger.debug(f"sum of acceptance: {0}")
+            return torch.zeros(vocab_size, dtype=torch.bool, device=device)
+
+        acceptance_matrix = torch.cat([self.token_acceptance_for_stack(tuple(stack), device) for stack in stacks])
+        # Merge stacks: any True => True
+        acceptance = acceptance_matrix.reshape(len(stacks), -1).any(dim=0)
+        logger.debug(f"sum of acceptance: {acceptance.sum()}")
+        return acceptance
+
+    # For each sub-rule in the grammar, cache whether each byte is accepted.
+    @lru_cache(maxsize=None)
+    def pos_char_acceptance(self, pos):
+        acceptance = [False] * 256
+        num_chars = self.grammar_encoding[pos]
+        pos += 1
+        for i in range(0, num_chars, 2):
+            start = self.grammar_encoding[pos + i]
+            end = self.grammar_encoding[pos + i + 1]
+            for j in range(start, end + 1):
+                acceptance[j] = True
+        return acceptance
+
+    # Probably this should be configurable. If the grammar has an exceedingly
+    # large number of states, the correct setting is a tradeoff between GPU
+    # RAM usage and recomputation time.
+    #
+    # The main variable that pushes usage up here is number of states in the
+    # grammar.
+    @lru_cache(maxsize=32768)
+    def token_acceptance_for_stack(self, stack, device):
+        st = time.time()
+        stack = list(stack)  # needs to come in as a tuple for lru_cache
+
+        accepts = [False] * len(self.token_trie)
+        accepts[self.eos_token_id] = len(stack) == 0
+        if len(stack) == 0:
+            logger.debug("empty stack")
+
+        def traverse_trie(trie, stacks):
+            for byte, next_trie in trie.items():
+                if byte == LEAF:
+                    token_id = next_trie
+                    if token_id != self.eos_token_id:
+                        accepts[token_id] = bool(stacks)
+                    continue
+
+                new_stacks = []
+                for stk in stacks:
+                    if not stk:
+                        continue
+
+                    pos = stk[-1]
+                    num_chars = self.grammar_encoding[pos]
+
+                    if not self.pos_char_acceptance(pos)[byte]:
+                        continue
+
+                    pos += num_chars + 1
+                    new_stack = stk[:-1]
+                    if self.grammar_encoding[pos]:
+                        new_stack.append(pos)
+                    new_stacks.extend(self.advance_stack(tuple(new_stack)))
+
+                if new_stacks:
+                    traverse_trie(next_trie, new_stacks)
+
+        traverse_trie(self.token_trie.trie, [stack])
+
+        et = time.time() - st
+        x = torch.tensor(accepts, dtype=torch.bool, device=device)
+        self.tt += et
+        self.nt += 1
+        return x
+
+
+class StaticGrammarConstraint(GrammarConstraint):
+    def __init__(self, grammar_str, start_rule_name, tokenizer):
+        super().__init__(grammar_str, start_rule_name, tokenizer)
+
+    def accept_char(self):
+        raise NotImplementedError
+
+
+#################
+# DATA STRUCTURES
+#################
+
+
+LEAF = -1
+
+
+class TokenTrie:
+    def __init__(self, tokenizer):
+        self.eos_token_id = tokenizer.eos_token_id
+        self.tokens = []
+        self.trie = {}
+        self.load_tokens(tokenizer)
+
+    def id2str(self, token_id):
+        return self.tokens[token_id]
+
+    def __len__(self):
+        return len(self.tokens)
+
+    def load_tokens(self, tokenizer):
+        def replace_hex(match):
+            hex_value = match.group(1)
+            return chr(int(hex_value, 16))
+
+        if "gpt2" in tokenizer.__class__.__name__.lower():
+            special = tokenizer.additional_special_tokens_ids
+
+            # Here, the decoder does a string replace on a bunch of sequences
+            # like ' .' for '.'. This interferes with our assumptions, where a
+            # token should always have exactly one representation.
+            # Fortunately(?) text-generation-inference doesn't seem to run this
+            # cleanup, so we get extraneous spaces. So, in order to generate
+            # the right token set for TGI, we have to skip the space trimming.
+            # See:
+            # https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3588-L3600
+            def fmt_token(id):
+                if id in special:
+                    return None
+                return bytes(tokenizer.decode([id], clean_up_tokenization_spaces=False), "utf-8")
+
+        elif "llama" in tokenizer.__class__.__name__.lower():
+
+            def fmt_token(id):
+                token = tokenizer.convert_ids_to_tokens(id)
+                token = re.sub(r"<0x([0-9a-fA-F]{2})>", replace_hex, token)
+                token = token.replace("▁", " ")
+                return bytes(token, "utf-8")
+
+        else:
+            print("Warning: unrecognized tokenizer: using default token formatting")
+
+            def fmt_token(id):
+                token = tokenizer.convert_ids_to_tokens(id)
+                return bytes(token, "utf-8")
+
+        # note: vocab_size doesn't work here because there are also
+        # get_added_vocab() tokens
+        self.tokens = [fmt_token(i) for i in range(len(tokenizer.get_vocab()))]
+        for token_id, token_bytes in enumerate(self.tokens):
+            if token_bytes is not None:
+                self.insert_into_trie(self.trie, token_bytes, token_id)
+
+    def insert_into_trie(self, trie, token_bytes, token_id):
+        current = trie
+        for byte in token_bytes:
+            if byte not in current:
+                current[byte] = {}
+            current = current[byte]
+        current[LEAF] = token_id
+
+
+@lru_cache(maxsize=5)
+def initialize_grammar(grammar_string):
+    return IncrementalGrammarConstraint(grammar_string.strip(), start_rule_name="root", tokenizer=shared.tokenizer)
diff --git a/modules/grammar/logits_process.py b/modules/grammar/logits_process.py
new file mode 100644
index 0000000000000000000000000000000000000000..85e8df687b0f1c1d4bb911dd04349f55b3a354a5
--- /dev/null
+++ b/modules/grammar/logits_process.py
@@ -0,0 +1,104 @@
+'''
+This file has been 100% copied from this PR to the Transformers library:
+https://github.com/huggingface/transformers/pull/27557
+
+Author: Saibo-creator
+Author GitHub: https://github.com/Saibo-creator
+
+All credits go to the author.
+'''
+
+import math
+
+import torch
+from transformers.generation.logits_process import LogitsProcessor
+from transformers.utils import add_start_docstrings
+
+LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+        scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
+            Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
+            search or log softmax for each vocabulary token when using beam search
+
+    Return:
+        `torch.FloatTensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
+
+"""
+
+
+class GrammarConstrainedLogitsProcessor(LogitsProcessor):
+    def __init__(self, grammar_constraint):
+        self.last_size = None
+        self.grammar_constraint = grammar_constraint
+        self.batch_stacks = None
+
+    def filter_logits(self, logits, device):
+        # resolve each stack to a tensor of True/False for each token
+        # indicating acceptance
+        # acceptance = self.grammar_acceptor.filter_vocab(self.stacks, device)
+        acceptance = self.grammar_constraint.batch_filter_vocab(self.batch_stacks, device)
+        # logger.debug(acceptance)
+        # Logits to -inf where False
+        logits[~acceptance] = -math.inf
+
+    # TODO: batching
+    def process_logits(self, input_ids, scores, parse_start_index=None):
+        """
+        :param input_ids:
+        :param scores:
+        :param parse_start_index: default None, which means generate from scratch. Set to 0 to parse all input_ids
+        :return:
+        """
+        # we dynamically create stacks at the first call, so that we know the batch size and beam size
+        if self.batch_stacks is None:
+            self.batch_stacks = [self.grammar_constraint.init_stacks() for _ in range(len(input_ids))]
+
+        # if self.last_size is not set (which would be the case when processing the first token).
+        # In this case, do nothing.
+        if self.last_size is None:
+            prefix_to_parse = [
+                single_input_ids[parse_start_index:] if parse_start_index is not None else []
+                for single_input_ids in input_ids
+            ]
+            # self.grammar_acceptor.accept_token_ids(prefix_to_parse, self.stacks)
+            self.batch_stacks = [
+                self.grammar_constraint.accept_token_ids(prefix, stack)
+                for prefix, stack in zip(prefix_to_parse, self.batch_stacks)
+            ]
+        #  if the length of the current input IDs (input_ids[0]) is exactly one more than self.last_size.
+        #  This is expected in a scenario where inputs are processed incrementally, one token at a time.
+        elif len(input_ids[0]) == self.last_size + 1:
+            # self.stacks = self.grammar_acceptor.accept_token_id(input_ids[0][-1], self.stacks)
+            self.batch_stacks = [
+                self.grammar_constraint.accept_token_id(single_input_ids[-1], stack)
+                for single_input_ids, stack in zip(input_ids, self.batch_stacks)
+            ]
+        #  ensure that the input size is consistent with the expected incremental processing
+        #  (i.e., one token at a time).
+        else:
+            # here we check if the input_ids are one token longer than the last time we processed
+            # but we don't check if input_ids are actually valid.
+            # Imagine a scenario where we generate 10 tokens, then we replace the 10 generated tokens with 10 new tokens.
+            # In this case, the input_ids will be consistent with the last_size, but the input_ids are not valid.
+            # However, should we really check if the input_ids are valid here?
+            # If we do, then we need to reparse the whole input_ids at each call, which is not efficient.
+            # Maybe we should just trust the user to provide valid input_ids?
+            # The conclusion is that, we assume the input_ids are valid, and our generation will be correct.
+            # If the input_ids are not valid, then the generation result will be wrong and we don't take responsibility for that.
+            raise RuntimeError(
+                "Input ID's length is inconsistent with the current state of "
+                "the GrammarConstrainedLogitsProcessor. If you want to process "
+                "another input sequence, please instantiate a new "
+                "GrammarConstrainedLogitsProcessor."
+            )
+
+        self.filter_logits(scores, scores.device)
+
+        self.last_size = len(input_ids[0])
+        return scores
+
+    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        return self.process_logits(input_ids, scores)
diff --git a/modules/html_generator.py b/modules/html_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3dd453ef57e553986ced5adcffd4cf30a190a16
--- /dev/null
+++ b/modules/html_generator.py
@@ -0,0 +1,309 @@
+import html
+import os
+import re
+import time
+from pathlib import Path
+
+import markdown
+from PIL import Image, ImageOps
+
+from modules import shared
+from modules.utils import get_available_chat_styles
+
+# This is to store the paths to the thumbnails of the profile pictures
+image_cache = {}
+
+with open(Path(__file__).resolve().parent / '../css/html_readable_style.css', 'r') as f:
+    readable_css = f.read()
+with open(Path(__file__).resolve().parent / '../css/html_4chan_style.css', 'r') as css_f:
+    _4chan_css = css_f.read()
+with open(Path(__file__).resolve().parent / '../css/html_instruct_style.css', 'r') as f:
+    instruct_css = f.read()
+
+# Custom chat styles
+chat_styles = {}
+for k in get_available_chat_styles():
+    chat_styles[k] = open(Path(f'css/chat_style-{k}.css'), 'r').read()
+
+# Handle styles that derive from other styles
+for k in chat_styles:
+    lines = chat_styles[k].split('\n')
+    input_string = lines[0]
+    match = re.search(r'chat_style-([a-z\-]*)\.css', input_string)
+
+    if match:
+        style = match.group(1)
+        chat_styles[k] = chat_styles.get(style, '') + '\n\n' + '\n'.join(lines[1:])
+
+
+def fix_newlines(string):
+    string = string.replace('\n', '\n\n')
+    string = re.sub(r"\n{3,}", "\n\n", string)
+    string = string.strip()
+    return string
+
+
+def replace_blockquote(m):
+    return m.group().replace('\n', '\n> ').replace('\\begin{blockquote}', '').replace('\\end{blockquote}', '')
+
+
+def convert_to_markdown(string):
+
+    # Blockquote
+    string = re.sub(r'(^|[\n])&gt;', r'\1>', string)
+    pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL)
+    string = pattern.sub(replace_blockquote, string)
+
+    # Code
+    string = string.replace('\\begin{code}', '```')
+    string = string.replace('\\end{code}', '```')
+    string = re.sub(r"(.)```", r"\1\n```", string)
+
+    result = ''
+    is_code = False
+    for line in string.split('\n'):
+        if line.lstrip(' ').startswith('```'):
+            is_code = not is_code
+
+        result += line
+        if is_code or line.startswith('|'):  # Don't add an extra \n for tables or code
+            result += '\n'
+        else:
+            result += '\n\n'
+
+    result = result.strip()
+    if is_code:
+        result += '\n```'  # Unfinished code block
+
+    # Unfinished list, like "\n1.". A |delete| string is added and then
+    # removed to force a <ol> or <ul> to be generated instead of a <p>.
+    if re.search(r'(\n\d+\.?|\n\*\s*)$', result):
+        delete_str = '|delete|'
+
+        if re.search(r'(\d+\.?)$', result) and not result.endswith('.'):
+            result += '.'
+
+        result = re.sub(r'(\n\d+\.?|\n\*\s*)$', r'\g<1> ' + delete_str, result)
+
+        html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'])
+        pos = html_output.rfind(delete_str)
+        if pos > -1:
+            html_output = html_output[:pos] + html_output[pos + len(delete_str):]
+    else:
+        html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'])
+
+    # Unescape code blocks
+    pattern = re.compile(r'<code[^>]*>(.*?)</code>', re.DOTALL)
+    html_output = pattern.sub(lambda x: html.unescape(x.group()), html_output)
+
+    return html_output
+
+
+def generate_basic_html(string):
+    string = convert_to_markdown(string)
+    string = f'<style>{readable_css}</style><div class="readable-container">{string}</div>'
+    return string
+
+
+def process_post(post, c):
+    t = post.split('\n')
+    number = t[0].split(' ')[1]
+    if len(t) > 1:
+        src = '\n'.join(t[1:])
+    else:
+        src = ''
+    src = re.sub('>', '&gt;', src)
+    src = re.sub('(&gt;&gt;[0-9]*)', '<span class="quote">\\1</span>', src)
+    src = re.sub('\n', '<br>\n', src)
+    src = f'<blockquote class="message_4chan">{src}\n'
+    src = f'<span class="name">Anonymous </span> <span class="number">No.{number}</span>\n{src}'
+    return src
+
+
+def generate_4chan_html(f):
+    posts = []
+    post = ''
+    c = -2
+    for line in f.splitlines():
+        line += "\n"
+        if line == '-----\n':
+            continue
+        elif line.startswith('--- '):
+            c += 1
+            if post != '':
+                src = process_post(post, c)
+                posts.append(src)
+            post = line
+        else:
+            post += line
+
+    if post != '':
+        src = process_post(post, c)
+        posts.append(src)
+
+    for i in range(len(posts)):
+        if i == 0:
+            posts[i] = f'<div class="op">{posts[i]}</div>\n'
+        else:
+            posts[i] = f'<div class="reply">{posts[i]}</div>\n'
+
+    output = ''
+    output += f'<style>{_4chan_css}</style><div id="parent"><div id="container">'
+    for post in posts:
+        output += post
+
+    output += '</div></div>'
+    output = output.split('\n')
+    for i in range(len(output)):
+        output[i] = re.sub(r'^(&gt;(.*?)(<br>|</div>))', r'<span class="greentext">\1</span>', output[i])
+        output[i] = re.sub(r'^<blockquote class="message_4chan">(&gt;(.*?)(<br>|</div>))', r'<blockquote class="message_4chan"><span class="greentext">\1</span>', output[i])
+
+    output = '\n'.join(output)
+    return output
+
+
+def make_thumbnail(image):
+    image = image.resize((350, round(image.size[1] / image.size[0] * 350)), Image.Resampling.LANCZOS)
+    if image.size[1] > 470:
+        image = ImageOps.fit(image, (350, 470), Image.LANCZOS)
+
+    return image
+
+
+def get_image_cache(path):
+    cache_folder = Path(shared.args.disk_cache_dir)
+    if not cache_folder.exists():
+        cache_folder.mkdir()
+
+    mtime = os.stat(path).st_mtime
+    if (path in image_cache and mtime != image_cache[path][0]) or (path not in image_cache):
+        img = make_thumbnail(Image.open(path))
+
+        old_p = Path(f'{cache_folder}/{path.name}_cache.png')
+        p = Path(f'{cache_folder}/cache_{path.name}.png')
+        if old_p.exists():
+            old_p.rename(p)
+
+        output_file = p
+        img.convert('RGB').save(output_file, format='PNG')
+        image_cache[path] = [mtime, output_file.as_posix()]
+
+    return image_cache[path][1]
+
+
+def generate_instruct_html(history):
+    output = f'<style>{instruct_css}</style><div class="chat" id="chat"><div class="messages">'
+    for i, _row in enumerate(history):
+        row = [convert_to_markdown(entry) for entry in _row]
+
+        if row[0]:  # don't display empty user messages
+            output += f"""
+                  <div class="user-message">
+                    <div class="text">
+                      <div class="message-body">
+                        {row[0]}
+                      </div>
+                    </div>
+                  </div>
+                """
+
+        output += f"""
+              <div class="assistant-message">
+                <div class="text">
+                  <div class="message-body">
+                    {row[1]}
+                  </div>
+                </div>
+              </div>
+            """
+
+    output += "</div></div>"
+
+    return output
+
+
+def generate_cai_chat_html(history, name1, name2, style, character, reset_cache=False):
+    output = f'<style>{chat_styles[style]}</style><div class="chat" id="chat"><div class="messages">'
+
+    # We use ?character and ?time.time() to force the browser to reset caches
+    img_bot = f'<img src="file/cache/pfp_character_thumb.png?{character}" class="pfp_character">' if Path("cache/pfp_character_thumb.png").exists() else ''
+    img_me = f'<img src="file/cache/pfp_me.png?{time.time() if reset_cache else ""}">' if Path("cache/pfp_me.png").exists() else ''
+
+    for i, _row in enumerate(history):
+        row = [convert_to_markdown(entry) for entry in _row]
+
+        if row[0]:  # don't display empty user messages
+            output += f"""
+                  <div class="message">
+                    <div class="circle-you">
+                      {img_me}
+                    </div>
+                    <div class="text">
+                      <div class="username">
+                        {name1}
+                      </div>
+                      <div class="message-body">
+                        {row[0]}
+                      </div>
+                    </div>
+                  </div>
+                """
+
+        output += f"""
+              <div class="message">
+                <div class="circle-bot">
+                  {img_bot}
+                </div>
+                <div class="text">
+                  <div class="username">
+                    {name2}
+                  </div>
+                  <div class="message-body">
+                    {row[1]}
+                  </div>
+                </div>
+              </div>
+            """
+
+    output += "</div></div>"
+    return output
+
+
+def generate_chat_html(history, name1, name2, reset_cache=False):
+    output = f'<style>{chat_styles["wpp"]}</style><div class="chat" id="chat"><div class="messages">'
+
+    for i, _row in enumerate(history):
+        row = [convert_to_markdown(entry) for entry in _row]
+
+        if row[0]:  # don't display empty user messages
+            output += f"""
+              <div class="message">
+                <div class="text-you">
+                  <div class="message-body">
+                    {row[0]}
+                  </div>
+                </div>
+              </div>
+            """
+
+        output += f"""
+          <div class="message">
+            <div class="text-bot">
+              <div class="message-body">
+                {row[1]}
+              </div>
+            </div>
+          </div>
+        """
+
+    output += "</div></div>"
+    return output
+
+
+def chat_html_wrapper(history, name1, name2, mode, style, character, reset_cache=False):
+    if mode == 'instruct':
+        return generate_instruct_html(history['visible'])
+    elif style == 'wpp':
+        return generate_chat_html(history['visible'], name1, name2)
+    else:
+        return generate_cai_chat_html(history['visible'], name1, name2, style, character, reset_cache)
diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
new file mode 100644
index 0000000000000000000000000000000000000000..4726669b373eab5ea7b49f644ba929fb0d8f1588
--- /dev/null
+++ b/modules/llamacpp_hf.py
@@ -0,0 +1,226 @@
+import os
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import torch
+from torch.nn import CrossEntropyLoss
+from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+from modules import RoPE, shared
+from modules.logging_colors import logger
+
+try:
+    import llama_cpp
+except:
+    llama_cpp = None
+
+try:
+    import llama_cpp_cuda
+except:
+    llama_cpp_cuda = None
+
+try:
+    import llama_cpp_cuda_tensorcores
+except:
+    llama_cpp_cuda_tensorcores = None
+
+
+def llama_cpp_lib():
+    if shared.args.cpu and llama_cpp is not None:
+        return llama_cpp
+    elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
+        return llama_cpp_cuda_tensorcores
+    elif llama_cpp_cuda is not None:
+        return llama_cpp_cuda
+    else:
+        return llama_cpp
+
+
+class LlamacppHF(PreTrainedModel):
+    def __init__(self, model, path):
+        super().__init__(PretrainedConfig())
+        self.model = model
+        self.generation_config = GenerationConfig()
+
+        self.past_seq = None
+        self.llamacpp_cache = {
+            'n_tokens': self.model.n_tokens,
+            'input_ids': self.model.input_ids,
+            'scores': self.model.scores,
+            'ctx': self.model._ctx
+        }
+
+        if shared.args.cfg_cache:
+            self.past_seq_negative = None
+            self.llamacpp_cache_negative = {
+                'n_tokens': self.model.n_tokens,
+                'input_ids': self.model.input_ids.copy(),
+                'scores': self.model.scores.copy(),
+                'ctx': llama_cpp_lib().llama_new_context_with_model(model.model, model.context_params)
+            }
+
+    def _validate_model_class(self):
+        pass
+
+    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
+        pass
+
+    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+        return {'input_ids': input_ids, **kwargs}
+
+    def save_cache(self):
+        self.llamacpp_cache.update({
+            'n_tokens': self.model.n_tokens,
+            'input_ids': self.model.input_ids,
+            'scores': self.model.scores,
+            'ctx': self.model._ctx
+        })
+
+    def save_negative_cache(self):
+        self.llamacpp_cache_negative.update({
+            'n_tokens': self.model.n_tokens,
+            'input_ids': self.model.input_ids,
+            'scores': self.model.scores,
+            'ctx': self.model._ctx
+        })
+
+    def load_cache(self):
+        self.model.n_tokens = self.llamacpp_cache['n_tokens']
+        self.model.input_ids = self.llamacpp_cache['input_ids']
+        self.model.scores = self.llamacpp_cache['scores']
+        self.model._ctx = self.llamacpp_cache['ctx']
+
+    def load_negative_cache(self):
+        self.model.n_tokens = self.llamacpp_cache_negative['n_tokens']
+        self.model.input_ids = self.llamacpp_cache_negative['input_ids']
+        self.model.scores = self.llamacpp_cache_negative['scores']
+        self.model._ctx = self.llamacpp_cache_negative['ctx']
+
+    @property
+    def device(self) -> torch.device:
+        return torch.device(0)
+
+    def __call__(self, *args, **kwargs):
+        use_cache = kwargs.get('use_cache', True)
+        labels = kwargs.get('labels', None)
+        past_key_values = kwargs.get('past_key_values', None)
+
+        if len(args) > 0:
+            if not shared.args.cfg_cache:
+                logger.error("Please enable the cfg-cache option to use CFG with llamacpp_HF.")
+                return
+
+            input_ids = args[0]
+            is_negative = True
+            past_seq = self.past_seq_negative
+            self.load_negative_cache()
+        else:
+            input_ids = kwargs['input_ids']
+            is_negative = False
+            past_seq = self.past_seq
+            self.load_cache()
+
+        seq = input_ids[0].tolist()
+        if is_negative and past_key_values is not None:
+            seq = past_key_values + seq
+
+        seq_tensor = torch.tensor(seq)
+        reset = True
+
+        # Make the forward call. The prefix-match code has been adapted from
+        # https://github.com/abetlen/llama-cpp-python/commit/f4090a0bb2a2a25acfe28d31c82cc1aa273bedee
+        if labels is None:
+            if past_seq is not None:
+                min_length = min(past_seq.shape[0], seq_tensor.shape[0])
+                indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
+                if len(indices) > 0:
+                    longest_prefix = indices[0].item()
+                else:
+                    longest_prefix = min_length
+
+                if longest_prefix > 0:
+                    reset = False
+                    self.model.n_tokens = longest_prefix
+                    if len(seq_tensor) - longest_prefix > 0:
+                        self.model.eval(seq[longest_prefix:])
+                    else:
+                        self.model.n_tokens -= 1
+                        self.model.eval([seq[-1]])
+
+            if reset:
+                self.model.reset()
+                self.model.eval(seq)
+
+            logits = torch.tensor(self.model.scores[self.model.n_tokens - 1, :]).view(1, 1, -1).to(input_ids.device)
+        else:
+            self.model.reset()
+            self.model.eval(seq)
+            logits = torch.tensor(self.model.eval_logits)
+            logits = logits.view(1, logits.shape[0], logits.shape[1]).to(input_ids.device)
+
+        if is_negative:
+            self.save_negative_cache()
+            self.past_seq_negative = seq_tensor
+        else:
+            self.save_cache()
+            self.past_seq = seq_tensor
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, logits.shape[-1])
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        return CausalLMOutputWithPast(logits=logits, past_key_values=seq if use_cache else None, loss=loss)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
+        assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported"
+
+        if isinstance(pretrained_model_name_or_path, str):
+            pretrained_model_name_or_path = Path(pretrained_model_name_or_path)
+
+        path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)
+        if path.is_file():
+            model_file = path
+        else:
+            model_file = list(path.glob('*.gguf'))[0]
+
+        logger.info(f"llama.cpp weights detected: {model_file}\n")
+
+        if shared.args.tensor_split is None or shared.args.tensor_split.strip() == '':
+            tensor_split_list = None
+        else:
+            tensor_split_list = [float(x) for x in shared.args.tensor_split.strip().split(",")]
+
+        params = {
+            'model_path': str(model_file),
+            'n_ctx': shared.args.n_ctx,
+            'n_threads': shared.args.threads or None,
+            'n_threads_batch': shared.args.threads_batch or None,
+            'n_batch': shared.args.n_batch,
+            'use_mmap': not shared.args.no_mmap,
+            'use_mlock': shared.args.mlock,
+            'mul_mat_q': not shared.args.no_mul_mat_q,
+            'numa': shared.args.numa,
+            'n_gpu_layers': shared.args.n_gpu_layers,
+            'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
+            'tensor_split': tensor_split_list,
+            'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
+            'logits_all': shared.args.logits_all,
+            'offload_kqv': not shared.args.no_offload_kqv,
+            'split_mode': 1 if not shared.args.row_split else 2
+        }
+
+        Llama = llama_cpp_lib().Llama
+        model = Llama(**params)
+
+        return LlamacppHF(model, model_file)
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c405a4baea76770ce1b2746e5e340ebaf727ed7
--- /dev/null
+++ b/modules/llamacpp_model.py
@@ -0,0 +1,190 @@
+import re
+from functools import partial
+
+import numpy as np
+import torch
+
+from modules import RoPE, shared
+from modules.callbacks import Iteratorize
+from modules.logging_colors import logger
+from modules.text_generation import get_max_prompt_length
+
+try:
+    import llama_cpp
+except:
+    llama_cpp = None
+
+try:
+    import llama_cpp_cuda
+except:
+    llama_cpp_cuda = None
+
+try:
+    import llama_cpp_cuda_tensorcores
+except:
+    llama_cpp_cuda_tensorcores = None
+
+
+def llama_cpp_lib():
+    if shared.args.cpu and llama_cpp is not None:
+        return llama_cpp
+    elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
+        return llama_cpp_cuda_tensorcores
+    elif llama_cpp_cuda is not None:
+        return llama_cpp_cuda
+    else:
+        return llama_cpp
+
+
+def ban_eos_logits_processor(eos_token, input_ids, logits):
+    logits[eos_token] = -float('inf')
+    return logits
+
+
+def custom_token_ban_logits_processor(token_ids, input_ids, logits):
+    for token_id in token_ids:
+        logits[token_id] = -float('inf')
+
+    return logits
+
+
+class LlamaCppModel:
+    def __init__(self):
+        self.initialized = False
+        self.grammar_string = ''
+        self.grammar = None
+
+    def __del__(self):
+        del self.model
+
+    @classmethod
+    def from_pretrained(self, path):
+
+        Llama = llama_cpp_lib().Llama
+        LlamaCache = llama_cpp_lib().LlamaCache
+
+        result = self()
+        cache_capacity = 0
+        if shared.args.cache_capacity is not None:
+            if 'GiB' in shared.args.cache_capacity:
+                cache_capacity = int(re.sub('[a-zA-Z]', '', shared.args.cache_capacity)) * 1000 * 1000 * 1000
+            elif 'MiB' in shared.args.cache_capacity:
+                cache_capacity = int(re.sub('[a-zA-Z]', '', shared.args.cache_capacity)) * 1000 * 1000
+            else:
+                cache_capacity = int(shared.args.cache_capacity)
+
+        if cache_capacity > 0:
+            logger.info("Cache capacity is " + str(cache_capacity) + " bytes")
+
+        if shared.args.tensor_split is None or shared.args.tensor_split.strip() == '':
+            tensor_split_list = None
+        else:
+            tensor_split_list = [float(x) for x in shared.args.tensor_split.strip().split(",")]
+
+        params = {
+            'model_path': str(path),
+            'n_ctx': shared.args.n_ctx,
+            'n_threads': shared.args.threads or None,
+            'n_threads_batch': shared.args.threads_batch or None,
+            'n_batch': shared.args.n_batch,
+            'use_mmap': not shared.args.no_mmap,
+            'use_mlock': shared.args.mlock,
+            'mul_mat_q': not shared.args.no_mul_mat_q,
+            'numa': shared.args.numa,
+            'n_gpu_layers': shared.args.n_gpu_layers,
+            'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
+            'tensor_split': tensor_split_list,
+            'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
+            'offload_kqv': not shared.args.no_offload_kqv,
+            'split_mode': 1 if not shared.args.row_split else 2
+        }
+
+        result.model = Llama(**params)
+        if cache_capacity > 0:
+            result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
+
+        # This is ugly, but the model and the tokenizer are the same object in this library.
+        return result, result
+
+    def encode(self, string):
+        if type(string) is str:
+            string = string.encode()
+
+        return self.model.tokenize(string)
+
+    def decode(self, ids, **kwargs):
+        return self.model.detokenize(ids).decode('utf-8')
+
+    def get_logits(self, tokens):
+        self.model.reset()
+        self.model.eval(tokens)
+        logits = self.model._scores
+        logits = np.expand_dims(logits, 0)  # batch dim is expected
+        return torch.tensor(logits, dtype=torch.float32)
+
+    def load_grammar(self, string):
+        if string != self.grammar_string:
+            self.grammar_string = string
+            if string.strip() != '':
+                self.grammar = llama_cpp_lib().LlamaGrammar.from_string(string)
+            else:
+                self.grammar = None
+
+    def generate(self, prompt, state, callback=None):
+        LogitsProcessorList = llama_cpp_lib().LogitsProcessorList
+        prompt = prompt if type(prompt) is str else prompt.decode()
+
+        # Handle truncation
+        prompt = self.encode(prompt)
+        prompt = prompt[-get_max_prompt_length(state):]
+        prompt = self.decode(prompt)
+
+        self.load_grammar(state['grammar_string'])
+        logit_processors = LogitsProcessorList()
+        if state['ban_eos_token']:
+            logit_processors.append(partial(ban_eos_logits_processor, self.model.token_eos()))
+
+        if state['custom_token_bans']:
+            to_ban = [int(x) for x in state['custom_token_bans'].split(',')]
+            if len(to_ban) > 0:
+                logit_processors.append(partial(custom_token_ban_logits_processor, to_ban))
+
+        completion_chunks = self.model.create_completion(
+            prompt=prompt,
+            max_tokens=state['max_new_tokens'],
+            temperature=state['temperature'],
+            top_p=state['top_p'],
+            min_p=state['min_p'],
+            typical_p=state['typical_p'],
+            frequency_penalty=state['frequency_penalty'],
+            presence_penalty=state['presence_penalty'],
+            repeat_penalty=state['repetition_penalty'],
+            top_k=state['top_k'],
+            stream=True,
+            seed=int(state['seed']) if state['seed'] != -1 else None,
+            tfs_z=state['tfs'],
+            mirostat_mode=int(state['mirostat_mode']),
+            mirostat_tau=state['mirostat_tau'],
+            mirostat_eta=state['mirostat_eta'],
+            logits_processor=logit_processors,
+            grammar=self.grammar
+        )
+
+        output = ""
+        for completion_chunk in completion_chunks:
+            if shared.stop_everything:
+                break
+
+            text = completion_chunk['choices'][0]['text']
+            output += text
+            if callback:
+                callback(text)
+
+        return output
+
+    def generate_with_streaming(self, *args, **kwargs):
+        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
+            reply = ''
+            for token in generator:
+                reply += token
+                yield reply
diff --git a/modules/loaders.py b/modules/loaders.py
new file mode 100644
index 0000000000000000000000000000000000000000..26b7c5e28368abc57462cbdc8923642ec762581b
--- /dev/null
+++ b/modules/loaders.py
@@ -0,0 +1,424 @@
+import functools
+from collections import OrderedDict
+
+import gradio as gr
+
+from modules import shared
+
+loaders_and_params = OrderedDict({
+    'Transformers': [
+        'cpu_memory',
+        'gpu_memory',
+        'load_in_8bit',
+        'bf16',
+        'cpu',
+        'disk',
+        'auto_devices',
+        'load_in_4bit',
+        'use_double_quant',
+        'quant_type',
+        'compute_dtype',
+        'trust_remote_code',
+        'no_use_fast',
+        'use_flash_attention_2',
+        'alpha_value',
+        'rope_freq_base',
+        'compress_pos_emb',
+        'disable_exllama',
+        'disable_exllamav2',
+        'transformers_info',
+    ],
+    'llama.cpp': [
+        'n_ctx',
+        'n_gpu_layers',
+        'tensor_split',
+        'n_batch',
+        'threads',
+        'threads_batch',
+        'no_mmap',
+        'mlock',
+        'no_mul_mat_q',
+        'alpha_value',
+        'rope_freq_base',
+        'compress_pos_emb',
+        'cpu',
+        'numa',
+        'no_offload_kqv',
+        'row_split',
+        'tensorcores',
+    ],
+    'llamacpp_HF': [
+        'n_ctx',
+        'n_gpu_layers',
+        'tensor_split',
+        'n_batch',
+        'threads',
+        'threads_batch',
+        'no_mmap',
+        'mlock',
+        'no_mul_mat_q',
+        'alpha_value',
+        'rope_freq_base',
+        'compress_pos_emb',
+        'cpu',
+        'numa',
+        'cfg_cache',
+        'trust_remote_code',
+        'no_use_fast',
+        'logits_all',
+        'no_offload_kqv',
+        'row_split',
+        'tensorcores',
+        'llamacpp_HF_info',
+    ],
+    'ExLlamav2_HF': [
+        'gpu_split',
+        'max_seq_len',
+        'cfg_cache',
+        'no_flash_attn',
+        'num_experts_per_token',
+        'cache_8bit',
+        'alpha_value',
+        'compress_pos_emb',
+        'trust_remote_code',
+        'no_use_fast',
+    ],
+    'ExLlamav2': [
+        'gpu_split',
+        'max_seq_len',
+        'no_flash_attn',
+        'num_experts_per_token',
+        'cache_8bit',
+        'alpha_value',
+        'compress_pos_emb',
+        'exllamav2_info',
+    ],
+    'AutoGPTQ': [
+        'triton',
+        'no_inject_fused_attention',
+        'no_inject_fused_mlp',
+        'no_use_cuda_fp16',
+        'wbits',
+        'groupsize',
+        'desc_act',
+        'disable_exllama',
+        'disable_exllamav2',
+        'gpu_memory',
+        'cpu_memory',
+        'cpu',
+        'disk',
+        'auto_devices',
+        'trust_remote_code',
+        'no_use_fast',
+        'autogptq_info',
+    ],
+    'AutoAWQ': [
+        'cpu_memory',
+        'gpu_memory',
+        'auto_devices',
+        'max_seq_len',
+        'no_inject_fused_attention',
+        'trust_remote_code',
+        'no_use_fast',
+    ],
+    'GPTQ-for-LLaMa': [
+        'wbits',
+        'groupsize',
+        'model_type',
+        'pre_layer',
+        'trust_remote_code',
+        'no_use_fast',
+        'gptq_for_llama_info',
+    ],
+    'ctransformers': [
+        'n_ctx',
+        'n_gpu_layers',
+        'n_batch',
+        'threads',
+        'model_type',
+        'no_mmap',
+        'mlock'
+    ],
+    'QuIP#': [
+        'trust_remote_code',
+        'no_use_fast',
+        'no_flash_attn',
+        'quipsharp_info',
+    ],
+    'HQQ': [
+        'hqq_backend',
+        'trust_remote_code',
+        'no_use_fast',
+    ]
+})
+
+
+def transformers_samplers():
+    return {
+        'temperature',
+        'temperature_last',
+        'dynamic_temperature',
+        'dynatemp_low',
+        'dynatemp_high',
+        'dynatemp_exponent',
+        'smoothing_factor',
+        'top_p',
+        'min_p',
+        'top_k',
+        'typical_p',
+        'epsilon_cutoff',
+        'eta_cutoff',
+        'tfs',
+        'top_a',
+        'repetition_penalty',
+        'presence_penalty',
+        'frequency_penalty',
+        'repetition_penalty_range',
+        'encoder_repetition_penalty',
+        'no_repeat_ngram_size',
+        'min_length',
+        'seed',
+        'do_sample',
+        'penalty_alpha',
+        'num_beams',
+        'length_penalty',
+        'early_stopping',
+        'mirostat_mode',
+        'mirostat_tau',
+        'mirostat_eta',
+        'grammar_file_row',
+        'grammar_string',
+        'guidance_scale',
+        'negative_prompt',
+        'ban_eos_token',
+        'custom_token_bans',
+        'sampler_priority',
+        'add_bos_token',
+        'skip_special_tokens',
+        'auto_max_new_tokens',
+        'prompt_lookup_num_tokens'
+    }
+
+
+loaders_samplers = {
+    'Transformers': transformers_samplers(),
+    'AutoGPTQ': transformers_samplers(),
+    'GPTQ-for-LLaMa': transformers_samplers(),
+    'AutoAWQ': transformers_samplers(),
+    'QuIP#': transformers_samplers(),
+    'HQQ': transformers_samplers(),
+    'ExLlamav2': {
+        'temperature',
+        'temperature_last',
+        'top_p',
+        'min_p',
+        'top_k',
+        'typical_p',
+        'tfs',
+        'top_a',
+        'repetition_penalty',
+        'presence_penalty',
+        'frequency_penalty',
+        'repetition_penalty_range',
+        'seed',
+        'mirostat_mode',
+        'mirostat_tau',
+        'mirostat_eta',
+        'ban_eos_token',
+        'add_bos_token',
+        'custom_token_bans',
+        'skip_special_tokens',
+        'auto_max_new_tokens',
+    },
+    'ExLlamav2_HF': {
+        'temperature',
+        'temperature_last',
+        'dynamic_temperature',
+        'dynatemp_low',
+        'dynatemp_high',
+        'dynatemp_exponent',
+        'smoothing_factor',
+        'top_p',
+        'min_p',
+        'top_k',
+        'typical_p',
+        'epsilon_cutoff',
+        'eta_cutoff',
+        'tfs',
+        'top_a',
+        'repetition_penalty',
+        'presence_penalty',
+        'frequency_penalty',
+        'repetition_penalty_range',
+        'encoder_repetition_penalty',
+        'no_repeat_ngram_size',
+        'min_length',
+        'seed',
+        'do_sample',
+        'mirostat_mode',
+        'mirostat_tau',
+        'mirostat_eta',
+        'grammar_file_row',
+        'grammar_string',
+        'guidance_scale',
+        'negative_prompt',
+        'ban_eos_token',
+        'custom_token_bans',
+        'sampler_priority',
+        'add_bos_token',
+        'skip_special_tokens',
+        'auto_max_new_tokens',
+    },
+    'llama.cpp': {
+        'temperature',
+        'top_p',
+        'min_p',
+        'top_k',
+        'typical_p',
+        'tfs',
+        'repetition_penalty',
+        'presence_penalty',
+        'frequency_penalty',
+        'seed',
+        'mirostat_mode',
+        'mirostat_tau',
+        'mirostat_eta',
+        'grammar_file_row',
+        'grammar_string',
+        'ban_eos_token',
+        'custom_token_bans',
+    },
+    'llamacpp_HF': {
+        'temperature',
+        'temperature_last',
+        'dynamic_temperature',
+        'dynatemp_low',
+        'dynatemp_high',
+        'dynatemp_exponent',
+        'smoothing_factor',
+        'top_p',
+        'min_p',
+        'top_k',
+        'typical_p',
+        'epsilon_cutoff',
+        'eta_cutoff',
+        'tfs',
+        'top_a',
+        'repetition_penalty',
+        'presence_penalty',
+        'frequency_penalty',
+        'repetition_penalty_range',
+        'encoder_repetition_penalty',
+        'no_repeat_ngram_size',
+        'min_length',
+        'seed',
+        'do_sample',
+        'mirostat_mode',
+        'mirostat_tau',
+        'mirostat_eta',
+        'grammar_file_row',
+        'grammar_string',
+        'guidance_scale',
+        'negative_prompt',
+        'ban_eos_token',
+        'custom_token_bans',
+        'sampler_priority',
+        'add_bos_token',
+        'skip_special_tokens',
+        'auto_max_new_tokens',
+    },
+    'ctransformers': {
+        'temperature',
+        'top_p',
+        'top_k',
+        'repetition_penalty',
+        'repetition_penalty_range',
+    },
+}
+
+loaders_model_types = {
+    'GPTQ-for-LLaMa': [
+        "None",
+        "llama",
+        "opt",
+        "gptj"
+    ],
+    'ctransformers': [
+        "None",
+        "gpt2",
+        "gptj",
+        "gptneox",
+        "llama",
+        "mpt",
+        "dollyv2",
+        "replit",
+        "starcoder",
+        "gptbigcode",
+        "falcon"
+    ],
+}
+
+
+@functools.cache
+def list_all_samplers():
+    all_samplers = set()
+    for k in loaders_samplers:
+        for sampler in loaders_samplers[k]:
+            all_samplers.add(sampler)
+
+    return sorted(all_samplers)
+
+
+def blacklist_samplers(loader, dynamic_temperature):
+    all_samplers = list_all_samplers()
+    output = []
+
+    for sampler in all_samplers:
+        if loader == 'All' or sampler in loaders_samplers[loader]:
+            if sampler.startswith('dynatemp'):
+                output.append(gr.update(visible=dynamic_temperature))
+            else:
+                output.append(gr.update(visible=True))
+        else:
+            output.append(gr.update(visible=False))
+
+    return output
+
+
+def get_model_types(loader):
+    if loader in loaders_model_types:
+        return loaders_model_types[loader]
+
+    return ["None"]
+
+
+def get_gpu_memory_keys():
+    return [k for k in shared.gradio if k.startswith('gpu_memory')]
+
+
+@functools.cache
+def get_all_params():
+    all_params = set()
+    for k in loaders_and_params:
+        for el in loaders_and_params[k]:
+            all_params.add(el)
+
+    if 'gpu_memory' in all_params:
+        all_params.remove('gpu_memory')
+        for k in get_gpu_memory_keys():
+            all_params.add(k)
+
+    return sorted(all_params)
+
+
+def make_loader_params_visible(loader):
+    params = []
+    all_params = get_all_params()
+    if loader in loaders_and_params:
+        params = loaders_and_params[loader]
+
+        if 'gpu_memory' in params:
+            params.remove('gpu_memory')
+            params += get_gpu_memory_keys()
+
+    return [gr.update(visible=True) if k in params else gr.update(visible=False) for k in all_params]
diff --git a/modules/logging_colors.py b/modules/logging_colors.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9791e26859621fcab72885ef3fb978478854646
--- /dev/null
+++ b/modules/logging_colors.py
@@ -0,0 +1,67 @@
+import logging
+
+logger = logging.getLogger('text-generation-webui')
+
+
+def setup_logging():
+    '''
+    Copied from: https://github.com/vladmandic/automatic
+
+    All credits to vladmandic.
+    '''
+
+    class RingBuffer(logging.StreamHandler):
+        def __init__(self, capacity):
+            super().__init__()
+            self.capacity = capacity
+            self.buffer = []
+            self.formatter = logging.Formatter('{ "asctime":"%(asctime)s", "created":%(created)f, "facility":"%(name)s", "pid":%(process)d, "tid":%(thread)d, "level":"%(levelname)s", "module":"%(module)s", "func":"%(funcName)s", "msg":"%(message)s" }')
+
+        def emit(self, record):
+            msg = self.format(record)
+            # self.buffer.append(json.loads(msg))
+            self.buffer.append(msg)
+            if len(self.buffer) > self.capacity:
+                self.buffer.pop(0)
+
+        def get(self):
+            return self.buffer
+
+    from rich.console import Console
+    from rich.logging import RichHandler
+    from rich.pretty import install as pretty_install
+    from rich.theme import Theme
+    from rich.traceback import install as traceback_install
+
+    level = logging.DEBUG
+    logger.setLevel(logging.DEBUG)  # log to file is always at level debug for facility `sd`
+    console = Console(log_time=True, log_time_format='%H:%M:%S-%f', theme=Theme({
+        "traceback.border": "black",
+        "traceback.border.syntax_error": "black",
+        "inspect.value.border": "black",
+    }))
+    logging.basicConfig(level=logging.ERROR, format='%(asctime)s | %(name)s | %(levelname)s | %(module)s | %(message)s', handlers=[logging.NullHandler()])  # redirect default logger to null
+    pretty_install(console=console)
+    traceback_install(console=console, extra_lines=1, max_frames=10, width=console.width, word_wrap=False, indent_guides=False, suppress=[])
+    while logger.hasHandlers() and len(logger.handlers) > 0:
+        logger.removeHandler(logger.handlers[0])
+
+    # handlers
+    rh = RichHandler(show_time=True, omit_repeated_times=False, show_level=True, show_path=False, markup=False, rich_tracebacks=True, log_time_format='%H:%M:%S-%f', level=level, console=console)
+    rh.setLevel(level)
+    logger.addHandler(rh)
+
+    rb = RingBuffer(100)  # 100 entries default in log ring buffer
+    rb.setLevel(level)
+    logger.addHandler(rb)
+    logger.buffer = rb.buffer
+
+    # overrides
+    logging.getLogger("urllib3").setLevel(logging.ERROR)
+    logging.getLogger("httpx").setLevel(logging.ERROR)
+    logging.getLogger("diffusers").setLevel(logging.ERROR)
+    logging.getLogger("torch").setLevel(logging.ERROR)
+    logging.getLogger("lycoris").handlers = logger.handlers
+
+
+setup_logging()
diff --git a/modules/logits.py b/modules/logits.py
new file mode 100644
index 0000000000000000000000000000000000000000..c630be88a5df1ea8b3ac3a4c58e47e34f6316e4d
--- /dev/null
+++ b/modules/logits.py
@@ -0,0 +1,74 @@
+import torch
+from transformers import is_torch_xpu_available
+
+from modules import sampler_hijack, shared
+from modules.logging_colors import logger
+from modules.text_generation import generate_reply
+
+global_scores = None
+
+
+def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
+    if shared.model is None:
+        logger.error("No model is loaded! Select one in the Model tab.")
+        return 'Error: No model is loaded1 Select one in the Model tab.', previous
+
+    is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
+    is_non_hf_llamacpp = shared.model.__class__.__name__ == 'LlamaCppModel'
+
+    if use_samplers:
+        if any([is_non_hf_exllamav2, is_non_hf_llamacpp]):
+            logger.error("Sampler hijacking is not supported non-Huggingface loaders.")
+            # sampling is all done in c for exllama, so it is really hard to hijack
+            # it should be possible to hijack llamacpp sampler by hijacking all their sampling methods,
+            # but it is not implemented yet
+            return 'Error: Sampler hijacking is not supported non-Huggingface loaders. Please disable the "Use samplers" option.', previous
+
+        state['max_new_tokens'] = 1
+        state['auto_max_new_tokens'] = False
+        for _ in generate_reply(prompt, state):
+            pass
+
+        scores = sampler_hijack.global_scores[-1]
+    else:
+        if is_non_hf_exllamav2:
+            if is_torch_xpu_available():
+                tokens = shared.tokenizer.encode(prompt).to("xpu:0")
+            else:
+                tokens = shared.tokenizer.encode(prompt).cuda()
+            scores = shared.model.get_logits(tokens)[-1][-1]
+        elif is_non_hf_llamacpp:
+            tokens = shared.tokenizer.encode(prompt)
+            scores = shared.model.get_logits(tokens)[-1][-1]
+        else:
+            if is_torch_xpu_available():
+                tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("xpu:0")
+            else:
+                tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda()
+            output = shared.model(input_ids=tokens)
+            scores = output['logits'][-1][-1]
+
+    probs = torch.softmax(scores, dim=-1, dtype=torch.float)
+    topk_values, topk_indices = torch.topk(probs, k=top_logits, largest=True, sorted=True)
+    if is_non_hf_llamacpp:
+        topk_indices = [i.expand((1, 1)) for i in topk_indices]
+
+    if hasattr(shared.tokenizer, 'convert_ids_to_tokens'):
+        tokens = [shared.tokenizer.convert_ids_to_tokens(int(i)) for i in topk_indices]
+    else:
+        tokens = [shared.tokenizer.decode(i) for i in topk_indices]
+
+    if return_dict:
+        topk_values = [float(i) for i in topk_values]
+        output = {}
+        for row in list(zip(topk_values, tokens)):
+            output[row[1]] = row[0]
+
+        return output
+    else:
+        topk_values = [f"{float(i):.5f}" for i in topk_values]
+        output = ''
+        for row in list(zip(topk_values, tokens)):
+            output += f"{row[0]}  -  {repr(row[1])}\n"
+
+        return output, previous
diff --git a/modules/metadata_gguf.py b/modules/metadata_gguf.py
new file mode 100644
index 0000000000000000000000000000000000000000..70ad41dc41251615e1788fcfdbcd1eaaee5b3ce3
--- /dev/null
+++ b/modules/metadata_gguf.py
@@ -0,0 +1,92 @@
+import struct
+from enum import IntEnum
+
+
+class GGUFValueType(IntEnum):
+    UINT8 = 0
+    INT8 = 1
+    UINT16 = 2
+    INT16 = 3
+    UINT32 = 4
+    INT32 = 5
+    FLOAT32 = 6
+    BOOL = 7
+    STRING = 8
+    ARRAY = 9
+    UINT64 = 10
+    INT64 = 11
+    FLOAT64 = 12
+
+
+_simple_value_packing = {
+    GGUFValueType.UINT8: "<B",
+    GGUFValueType.INT8: "<b",
+    GGUFValueType.UINT16: "<H",
+    GGUFValueType.INT16: "<h",
+    GGUFValueType.UINT32: "<I",
+    GGUFValueType.INT32: "<i",
+    GGUFValueType.FLOAT32: "<f",
+    GGUFValueType.UINT64: "<Q",
+    GGUFValueType.INT64: "<q",
+    GGUFValueType.FLOAT64: "<d",
+    GGUFValueType.BOOL: "?",
+}
+
+value_type_info = {
+    GGUFValueType.UINT8: 1,
+    GGUFValueType.INT8: 1,
+    GGUFValueType.UINT16: 2,
+    GGUFValueType.INT16: 2,
+    GGUFValueType.UINT32: 4,
+    GGUFValueType.INT32: 4,
+    GGUFValueType.FLOAT32: 4,
+    GGUFValueType.UINT64: 8,
+    GGUFValueType.INT64: 8,
+    GGUFValueType.FLOAT64: 8,
+    GGUFValueType.BOOL: 1,
+}
+
+
+def get_single(value_type, file):
+    if value_type == GGUFValueType.STRING:
+        value_length = struct.unpack("<Q", file.read(8))[0]
+        value = file.read(value_length)
+        try:
+            value = value.decode('utf-8')
+        except:
+            pass
+    else:
+        type_str = _simple_value_packing.get(value_type)
+        bytes_length = value_type_info.get(value_type)
+        value = struct.unpack(type_str, file.read(bytes_length))[0]
+
+    return value
+
+
+def load_metadata(fname):
+    metadata = {}
+    with open(fname, 'rb') as file:
+        GGUF_MAGIC = struct.unpack("<I", file.read(4))[0]
+        GGUF_VERSION = struct.unpack("<I", file.read(4))[0]
+        ti_data_count = struct.unpack("<Q", file.read(8))[0]
+        kv_data_count = struct.unpack("<Q", file.read(8))[0]
+
+        if GGUF_VERSION == 1:
+            raise Exception('You are using an outdated GGUF, please download a new one.')
+
+        for i in range(kv_data_count):
+            key_length = struct.unpack("<Q", file.read(8))[0]
+            key = file.read(key_length)
+
+            value_type = GGUFValueType(struct.unpack("<I", file.read(4))[0])
+            if value_type == GGUFValueType.ARRAY:
+                ltype = GGUFValueType(struct.unpack("<I", file.read(4))[0])
+                length = struct.unpack("<Q", file.read(8))[0]
+
+                arr = [get_single(ltype, file) for _ in range(length)]
+                metadata[key.decode()] = arr
+            else:
+                value = get_single(value_type, file)
+                metadata[key.decode()] = value
+
+    return metadata
diff --git a/modules/models.py b/modules/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..5929e868b3276663d6477f30c5d585370a6eb8da
--- /dev/null
+++ b/modules/models.py
@@ -0,0 +1,453 @@
+import gc
+import logging
+import os
+import re
+import time
+import traceback
+from pathlib import Path
+
+import torch
+import transformers
+from accelerate import infer_auto_device_map, init_empty_weights
+from accelerate.utils import is_ccl_available, is_xpu_available
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    GPTQConfig
+)
+
+import modules.shared as shared
+from modules import RoPE, sampler_hijack
+from modules.logging_colors import logger
+from modules.models_settings import get_model_metadata
+from modules.relative_imports import RelativeImport
+
+transformers.logging.set_verbosity_error()
+
+local_rank = None
+if shared.args.deepspeed:
+    import deepspeed
+    from transformers.deepspeed import (
+        HfDeepSpeedConfig,
+        is_deepspeed_zero3_enabled
+    )
+
+    from modules.deepspeed_parameters import generate_ds_config
+
+    # Distributed setup
+    local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0"))
+    world_size = int(os.getenv("WORLD_SIZE", "1"))
+    if is_xpu_available() and is_ccl_available():
+        torch.xpu.set_device(local_rank)
+        deepspeed.init_distributed(backend="ccl")
+    else:
+        torch.cuda.set_device(local_rank)
+        deepspeed.init_distributed()
+    ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
+    dschf = HfDeepSpeedConfig(ds_config)  # Keep this object alive for the Transformers integration
+
+sampler_hijack.hijack_samplers()
+
+
+def load_model(model_name, loader=None):
+    logger.info(f"Loading {model_name}")
+    t0 = time.time()
+
+    shared.is_seq2seq = False
+    shared.model_name = model_name
+    load_func_map = {
+        'Transformers': huggingface_loader,
+        'AutoGPTQ': AutoGPTQ_loader,
+        'GPTQ-for-LLaMa': GPTQ_loader,
+        'llama.cpp': llamacpp_loader,
+        'llamacpp_HF': llamacpp_HF_loader,
+        'ExLlamav2': ExLlamav2_loader,
+        'ExLlamav2_HF': ExLlamav2_HF_loader,
+        'ctransformers': ctransformers_loader,
+        'AutoAWQ': AutoAWQ_loader,
+        'QuIP#': QuipSharp_loader,
+        'HQQ': HQQ_loader,
+    }
+
+    metadata = get_model_metadata(model_name)
+    if loader is None:
+        if shared.args.loader is not None:
+            loader = shared.args.loader
+        else:
+            loader = metadata['loader']
+            if loader is None:
+                logger.error('The path to the model does not exist. Exiting.')
+                raise ValueError
+
+    shared.args.loader = loader
+    output = load_func_map[loader](model_name)
+    if type(output) is tuple:
+        model, tokenizer = output
+    else:
+        model = output
+        if model is None:
+            return None, None
+        else:
+            tokenizer = load_tokenizer(model_name, model)
+
+    shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
+    if loader.lower().startswith('exllama'):
+        shared.settings['truncation_length'] = shared.args.max_seq_len
+    elif loader in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:
+        shared.settings['truncation_length'] = shared.args.n_ctx
+
+    logger.info(f"LOADER: \"{loader}\"")
+    logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}")
+    logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"")
+    logger.info(f"Loaded the model in {(time.time()-t0):.2f} seconds.")
+    return model, tokenizer
+
+
+def load_tokenizer(model_name, model):
+    tokenizer = None
+    path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
+    if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
+        tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
+    elif path_to_model.exists():
+        if shared.args.no_use_fast:
+            logger.info('Loading the tokenizer with use_fast=False.')
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            path_to_model,
+            trust_remote_code=shared.args.trust_remote_code,
+            use_fast=not shared.args.no_use_fast
+        )
+
+    return tokenizer
+
+
+def huggingface_loader(model_name):
+    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
+    params = {
+        'low_cpu_mem_usage': True,
+        'trust_remote_code': shared.args.trust_remote_code,
+        'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
+        'use_safetensors': True if shared.args.force_safetensors else None
+    }
+
+    if shared.args.use_flash_attention_2:
+        params['use_flash_attention_2'] = True
+
+    config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=params['trust_remote_code'])
+
+    if 'chatglm' in model_name.lower():
+        LoaderClass = AutoModel
+    else:
+        if config.to_dict().get('is_encoder_decoder', False):
+            LoaderClass = AutoModelForSeq2SeqLM
+            shared.is_seq2seq = True
+        else:
+            LoaderClass = AutoModelForCausalLM
+
+    # Load the model in simple 16-bit mode by default
+    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.compress_pos_emb > 1, shared.args.alpha_value > 1, shared.args.disable_exllama, shared.args.disable_exllamav2]):
+        model = LoaderClass.from_pretrained(path_to_model, **params)
+        if torch.backends.mps.is_available():
+            device = torch.device('mps')
+            model = model.to(device)
+        elif is_xpu_available():
+            device = torch.device("xpu")
+            model = model.to(device)
+        else:
+            model = model.cuda()
+
+    # DeepSpeed ZeRO-3
+    elif shared.args.deepspeed:
+        model = LoaderClass.from_pretrained(path_to_model, torch_dtype=params['torch_dtype'], trust_remote_code=params['trust_remote_code'])
+        model = deepspeed.initialize(model=model, config_params=ds_config, model_parameters=None, optimizer=None, lr_scheduler=None)[0]
+        model.module.eval()  # Inference
+        logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}')
+
+    # Load with quantization and/or offloading
+    else:
+        if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())):
+            logger.warning('torch.cuda.is_available() and is_xpu_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.')
+            shared.args.cpu = True
+
+        if shared.args.cpu:
+            params['torch_dtype'] = torch.float32
+        else:
+            params['device_map'] = 'auto'
+            params['max_memory'] = get_max_memory_dict()
+            if shared.args.load_in_4bit:
+                # See https://github.com/huggingface/transformers/pull/23479/files
+                # and https://huggingface.co/blog/4bit-transformers-bitsandbytes
+                quantization_config_params = {
+                    'load_in_4bit': True,
+                    'bnb_4bit_compute_dtype': eval("torch.{}".format(shared.args.compute_dtype)) if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None,
+                    'bnb_4bit_quant_type': shared.args.quant_type,
+                    'bnb_4bit_use_double_quant': shared.args.use_double_quant,
+                }
+
+                logger.info('Using the following 4-bit params: ' + str(quantization_config_params))
+                params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params)
+
+            elif shared.args.load_in_8bit:
+                if any((shared.args.auto_devices, shared.args.gpu_memory)):
+                    params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
+                else:
+                    params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True)
+
+                if params['max_memory'] is not None:
+                    with init_empty_weights():
+                        model = LoaderClass.from_config(config, trust_remote_code=params['trust_remote_code'])
+
+                    model.tie_weights()
+                    params['device_map'] = infer_auto_device_map(
+                        model,
+                        dtype=torch.int8,
+                        max_memory=params['max_memory'],
+                        no_split_module_classes=model._no_split_modules
+                    )
+
+            if shared.args.disk:
+                params['offload_folder'] = shared.args.disk_cache_dir
+
+        if shared.args.disable_exllama or shared.args.disable_exllamav2:
+            try:
+                gptq_config = GPTQConfig(
+                    bits=config.quantization_config.get('bits', 4),
+                    disable_exllama=shared.args.disable_exllama,
+                    disable_exllamav2=shared.args.disable_exllamav2,
+                )
+
+                params['quantization_config'] = gptq_config
+                logger.info(f'Loading with disable_exllama={shared.args.disable_exllama} and disable_exllamav2={shared.args.disable_exllamav2}.')
+            except:
+                exc = traceback.format_exc()
+                logger.error('Failed to disable exllama. Does the config.json for this model contain the necessary quantization info?')
+                print(exc)
+
+        if shared.args.compress_pos_emb > 1:
+            params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
+        elif shared.args.alpha_value > 1:
+            params['rope_scaling'] = {'type': 'dynamic', 'factor': RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base)}
+
+        model = LoaderClass.from_pretrained(path_to_model, **params)
+
+    return model
+
+
+def llamacpp_loader(model_name):
+    from modules.llamacpp_model import LlamaCppModel
+
+    path = Path(f'{shared.args.model_dir}/{model_name}')
+    if path.is_file():
+        model_file = path
+    else:
+        model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
+
+    logger.info(f"llama.cpp weights detected: {model_file}")
+    model, tokenizer = LlamaCppModel.from_pretrained(model_file)
+    return model, tokenizer
+
+
+def llamacpp_HF_loader(model_name):
+    from modules.llamacpp_hf import LlamacppHF
+
+    for fname in [model_name, "oobabooga_llama-tokenizer", "llama-tokenizer"]:
+        path = Path(f'{shared.args.model_dir}/{fname}')
+        if all((path / file).exists() for file in ['tokenizer_config.json', 'special_tokens_map.json', 'tokenizer.model']):
+            logger.info(f'Using tokenizer from: {path}')
+            break
+    else:
+        logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.")
+        return None, None
+
+    if shared.args.no_use_fast:
+        logger.info('Loading the tokenizer with use_fast=False.')
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        path,
+        trust_remote_code=shared.args.trust_remote_code,
+        use_fast=not shared.args.no_use_fast
+    )
+
+    model = LlamacppHF.from_pretrained(model_name)
+    return model, tokenizer
+
+
+def ctransformers_loader(model_name):
+    from modules.ctransformers_model import CtransformersModel
+
+    path = Path(f'{shared.args.model_dir}/{model_name}')
+    ctrans = CtransformersModel()
+    if ctrans.model_type_is_auto():
+        model_file = path
+    else:
+        if path.is_file():
+            model_file = path
+        else:
+            entries = Path(f'{shared.args.model_dir}/{model_name}')
+            gguf = list(entries.glob('*.gguf'))
+            bin = list(entries.glob('*.bin'))
+            if len(gguf) > 0:
+                model_file = gguf[0]
+            elif len(bin) > 0:
+                model_file = bin[0]
+            else:
+                logger.error("Could not find a model for ctransformers.")
+                return None, None
+
+    logger.info(f'ctransformers weights detected: {model_file}')
+    model, tokenizer = ctrans.from_pretrained(model_file)
+    return model, tokenizer
+
+
+def AutoAWQ_loader(model_name):
+    from awq import AutoAWQForCausalLM
+
+    model_dir = Path(f'{shared.args.model_dir}/{model_name}')
+
+    model = AutoAWQForCausalLM.from_quantized(
+        quant_path=model_dir,
+        max_new_tokens=shared.args.max_seq_len,
+        trust_remote_code=shared.args.trust_remote_code,
+        fuse_layers=not shared.args.no_inject_fused_attention,
+        max_memory=get_max_memory_dict(),
+        batch_size=1,
+        safetensors=any(model_dir.glob('*.safetensors')),
+    )
+
+    return model
+
+
+def QuipSharp_loader(model_name):
+    try:
+        with RelativeImport("repositories/quip-sharp"):
+            from lib.utils.unsafe_import import model_from_hf_path
+    except:
+        logger.error(
+            "\nQuIP# has not been found. It must be installed manually for now.\n"
+            "For instructions on how to do that, please consult:\n"
+            "https://github.com/oobabooga/text-generation-webui/pull/4803\n"
+        )
+        return None, None
+
+    # This fixes duplicate logging messages after the import above.
+    handlers = logging.getLogger().handlers
+    if len(handlers) > 1:
+        logging.getLogger().removeHandler(handlers[1])
+
+    model_dir = Path(f'{shared.args.model_dir}/{model_name}')
+    if not all((model_dir / file).exists() for file in ['tokenizer_config.json', 'special_tokens_map.json', 'tokenizer.model']):
+        logger.error(f"Could not load the model because the tokenizer files could not be found in the model folder. Please download the following files from the original (unquantized) model into {model_dir}: special_tokens_map.json, tokenizer.json, tokenizer.model, tokenizer_config.json.")
+        return None, None
+
+    model, model_str = model_from_hf_path(
+        model_dir,
+        use_cuda_graph=False,
+        use_flash_attn=not shared.args.no_flash_attn
+    )
+
+    return model
+
+
+def GPTQ_loader(model_name):
+
+    # Monkey patch
+    if shared.args.monkey_patch:
+        logger.warning("Applying the monkey patch for using LoRAs with GPTQ models. It may cause undefined behavior outside its intended scope.")
+        from modules.monkey_patch_gptq_lora import load_model_llama
+
+        model, _ = load_model_llama(model_name)
+
+    # No monkey patch
+    else:
+        import modules.GPTQ_loader
+
+        model = modules.GPTQ_loader.load_quantized(model_name)
+
+    return model
+
+
+def AutoGPTQ_loader(model_name):
+    import modules.AutoGPTQ_loader
+
+    return modules.AutoGPTQ_loader.load_quantized(model_name)
+
+
+def ExLlamav2_loader(model_name):
+    from modules.exllamav2 import Exllamav2Model
+
+    model, tokenizer = Exllamav2Model.from_pretrained(model_name)
+    return model, tokenizer
+
+
+def ExLlamav2_HF_loader(model_name):
+    from modules.exllamav2_hf import Exllamav2HF
+
+    return Exllamav2HF.from_pretrained(model_name)
+
+
+def HQQ_loader(model_name):
+    from hqq.core.quantize import HQQBackend, HQQLinear
+    from hqq.engine.hf import HQQModelForCausalLM
+
+    logger.info(f"Loading HQQ model with backend: {shared.args.hqq_backend}")
+
+    model_dir = Path(f'{shared.args.model_dir}/{model_name}')
+    model = HQQModelForCausalLM.from_quantized(str(model_dir))
+    HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend))
+    return model
+
+
+def get_max_memory_dict():
+    max_memory = {}
+    max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
+    if shared.args.gpu_memory:
+        memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory))
+        for i in range(len(memory_map)):
+            max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
+
+        max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
+
+    # If --auto-devices is provided standalone, try to get a reasonable value
+    # for the maximum memory of device :0
+    elif shared.args.auto_devices:
+        if is_xpu_available():
+            total_mem = (torch.xpu.get_device_properties(0).total_memory / (1024 * 1024))
+        else:
+            total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
+
+        suggestion = round((total_mem - 1000) / 1000) * 1000
+        if total_mem - suggestion < 800:
+            suggestion -= 1000
+
+        suggestion = int(round(suggestion / 1000))
+        logger.warning(f"Auto-assiging --gpu-memory {suggestion} for your GPU to try to prevent out-of-memory errors. You can manually set other values.")
+        max_memory[0] = f'{suggestion}GiB'
+        max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
+
+    return max_memory if len(max_memory) > 0 else None
+
+
+def clear_torch_cache():
+    gc.collect()
+    if not shared.args.cpu:
+        if is_xpu_available():
+            torch.xpu.empty_cache()
+        else:
+            torch.cuda.empty_cache()
+
+
+def unload_model():
+    shared.model = shared.tokenizer = None
+    shared.model_name = 'None'
+    shared.lora_names = []
+    shared.model_dirty_from_training = False
+    clear_torch_cache()
+
+
+def reload_model():
+    unload_model()
+    shared.model, shared.tokenizer = load_model(shared.model_name)
diff --git a/modules/models_settings.py b/modules/models_settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..9acc7efa3bb4ed714da8dabc0cd306721cf78fd2
--- /dev/null
+++ b/modules/models_settings.py
@@ -0,0 +1,269 @@
+import json
+import re
+from pathlib import Path
+
+import yaml
+
+from modules import chat, loaders, metadata_gguf, shared, ui
+
+
+def get_fallback_settings():
+    return {
+        'wbits': 'None',
+        'groupsize': 'None',
+        'desc_act': False,
+        'model_type': 'None',
+        'max_seq_len': 2048,
+        'n_ctx': 2048,
+        'rope_freq_base': 0,
+        'compress_pos_emb': 1,
+        'truncation_length': shared.settings['truncation_length'],
+        'skip_special_tokens': shared.settings['skip_special_tokens'],
+        'custom_stopping_strings': shared.settings['custom_stopping_strings'],
+    }
+
+
+def get_model_metadata(model):
+    model_settings = {}
+
+    # Get settings from models/config.yaml and models/config-user.yaml
+    settings = shared.model_config
+    for pat in settings:
+        if re.match(pat.lower(), model.lower()):
+            for k in settings[pat]:
+                model_settings[k] = settings[pat][k]
+
+    path = Path(f'{shared.args.model_dir}/{model}/config.json')
+    if path.exists():
+        hf_metadata = json.loads(open(path, 'r', encoding='utf-8').read())
+    else:
+        hf_metadata = None
+
+    if 'loader' not in model_settings:
+        if hf_metadata is not None and 'quip_params' in hf_metadata:
+            loader = 'QuIP#'
+        else:
+            loader = infer_loader(model, model_settings)
+
+        model_settings['loader'] = loader
+
+    # GGUF metadata
+    if model_settings['loader'] in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:
+        path = Path(f'{shared.args.model_dir}/{model}')
+        if path.is_file():
+            model_file = path
+        else:
+            model_file = list(path.glob('*.gguf'))[0]
+
+        metadata = metadata_gguf.load_metadata(model_file)
+        if 'llama.context_length' in metadata:
+            model_settings['n_ctx'] = metadata['llama.context_length']
+        if 'llama.rope.scale_linear' in metadata:
+            model_settings['compress_pos_emb'] = metadata['llama.rope.scale_linear']
+        if 'llama.rope.freq_base' in metadata:
+            model_settings['rope_freq_base'] = metadata['llama.rope.freq_base']
+        if 'tokenizer.chat_template' in metadata:
+            template = metadata['tokenizer.chat_template']
+            eos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.eos_token_id']]
+            bos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.bos_token_id']]
+            template = template.replace('eos_token', "'{}'".format(eos_token))
+            template = template.replace('bos_token', "'{}'".format(bos_token))
+
+            template = re.sub(r'raise_exception\([^)]*\)', "''", template)
+            model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
+            model_settings['instruction_template_str'] = template
+
+    else:
+        # Transformers metadata
+        if hf_metadata is not None:
+            metadata = json.loads(open(path, 'r', encoding='utf-8').read())
+            if 'max_position_embeddings' in metadata:
+                model_settings['truncation_length'] = metadata['max_position_embeddings']
+                model_settings['max_seq_len'] = metadata['max_position_embeddings']
+
+            if 'rope_theta' in metadata:
+                model_settings['rope_freq_base'] = metadata['rope_theta']
+
+            if 'rope_scaling' in metadata and type(metadata['rope_scaling']) is dict and all(key in metadata['rope_scaling'] for key in ('type', 'factor')):
+                if metadata['rope_scaling']['type'] == 'linear':
+                    model_settings['compress_pos_emb'] = metadata['rope_scaling']['factor']
+
+            if 'quantization_config' in metadata:
+                if 'bits' in metadata['quantization_config']:
+                    model_settings['wbits'] = metadata['quantization_config']['bits']
+                if 'group_size' in metadata['quantization_config']:
+                    model_settings['groupsize'] = metadata['quantization_config']['group_size']
+                if 'desc_act' in metadata['quantization_config']:
+                    model_settings['desc_act'] = metadata['quantization_config']['desc_act']
+
+        # Read AutoGPTQ metadata
+        path = Path(f'{shared.args.model_dir}/{model}/quantize_config.json')
+        if path.exists():
+            metadata = json.loads(open(path, 'r', encoding='utf-8').read())
+            if 'bits' in metadata:
+                model_settings['wbits'] = metadata['bits']
+            if 'group_size' in metadata:
+                model_settings['groupsize'] = metadata['group_size']
+            if 'desc_act' in metadata:
+                model_settings['desc_act'] = metadata['desc_act']
+
+    # Try to find the Jinja instruct template
+    path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json'
+    if path.exists():
+        metadata = json.loads(open(path, 'r', encoding='utf-8').read())
+        if 'chat_template' in metadata:
+            template = metadata['chat_template']
+            for k in ['eos_token', 'bos_token']:
+                if k in metadata:
+                    value = metadata[k]
+                    if type(value) is dict:
+                        value = value['content']
+
+                    template = template.replace(k, "'{}'".format(value))
+
+            template = re.sub(r'raise_exception\([^)]*\)', "''", template)
+            model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
+            model_settings['instruction_template_str'] = template
+
+    if 'instruction_template' not in model_settings:
+        model_settings['instruction_template'] = 'Alpaca'
+
+    if model_settings['instruction_template'] != 'Custom (obtained from model metadata)':
+        model_settings['instruction_template_str'] = chat.load_instruction_template(model_settings['instruction_template'])
+
+    # Ignore rope_freq_base if set to the default value
+    if 'rope_freq_base' in model_settings and model_settings['rope_freq_base'] == 10000:
+        model_settings.pop('rope_freq_base')
+
+    # Apply user settings from models/config-user.yaml
+    settings = shared.user_config
+    for pat in settings:
+        if re.match(pat.lower(), model.lower()):
+            for k in settings[pat]:
+                model_settings[k] = settings[pat][k]
+
+    return model_settings
+
+
+def infer_loader(model_name, model_settings):
+    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
+    if not path_to_model.exists():
+        loader = None
+    elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0):
+        loader = 'ExLlamav2_HF'
+    elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()):
+        loader = 'AutoAWQ'
+    elif len(list(path_to_model.glob('*.gguf'))) > 0:
+        loader = 'llama.cpp'
+    elif re.match(r'.*\.gguf', model_name.lower()):
+        loader = 'llama.cpp'
+    elif re.match(r'.*exl2', model_name.lower()):
+        loader = 'ExLlamav2_HF'
+    elif re.match(r'.*-hqq', model_name.lower()):
+        return 'HQQ'
+    else:
+        loader = 'Transformers'
+
+    return loader
+
+
+def update_model_parameters(state, initial=False):
+    '''
+    UI: update the command-line arguments based on the interface values
+    '''
+    elements = ui.list_model_elements()  # the names of the parameters
+    gpu_memories = []
+
+    for i, element in enumerate(elements):
+        if element not in state:
+            continue
+
+        value = state[element]
+        if element.startswith('gpu_memory'):
+            gpu_memories.append(value)
+            continue
+
+        if initial and element in shared.provided_arguments:
+            continue
+
+        # Setting null defaults
+        if element in ['wbits', 'groupsize', 'model_type'] and value == 'None':
+            value = vars(shared.args_defaults)[element]
+        elif element in ['cpu_memory'] and value == 0:
+            value = vars(shared.args_defaults)[element]
+
+        # Making some simple conversions
+        if element in ['wbits', 'groupsize', 'pre_layer']:
+            value = int(value)
+        elif element == 'cpu_memory' and value is not None:
+            value = f"{value}MiB"
+
+        if element in ['pre_layer']:
+            value = [value] if value > 0 else None
+
+        setattr(shared.args, element, value)
+
+    found_positive = False
+    for i in gpu_memories:
+        if i > 0:
+            found_positive = True
+            break
+
+    if not (initial and vars(shared.args)['gpu_memory'] != vars(shared.args_defaults)['gpu_memory']):
+        if found_positive:
+            shared.args.gpu_memory = [f"{i}MiB" for i in gpu_memories]
+        else:
+            shared.args.gpu_memory = None
+
+
+def apply_model_settings_to_state(model, state):
+    '''
+    UI: update the state variable with the model settings
+    '''
+    model_settings = get_model_metadata(model)
+    if 'loader' in model_settings:
+        loader = model_settings.pop('loader')
+
+        # If the user is using an alternative loader for the same model type, let them keep using it
+        if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 'AutoGPTQ']) and not (loader == 'llama.cpp' and state['loader'] in ['llamacpp_HF', 'ctransformers']):
+            state['loader'] = loader
+
+    for k in model_settings:
+        if k in state:
+            if k in ['wbits', 'groupsize']:
+                state[k] = str(model_settings[k])
+            else:
+                state[k] = model_settings[k]
+
+    return state
+
+
+def save_model_settings(model, state):
+    '''
+    Save the settings for this model to models/config-user.yaml
+    '''
+    if model == 'None':
+        yield ("Not saving the settings because no model is loaded.")
+        return
+
+    with Path(f'{shared.args.model_dir}/config-user.yaml') as p:
+        if p.exists():
+            user_config = yaml.safe_load(open(p, 'r').read())
+        else:
+            user_config = {}
+
+        model_regex = model + '$'  # For exact matches
+        if model_regex not in user_config:
+            user_config[model_regex] = {}
+
+        for k in ui.list_model_elements():
+            if k == 'loader' or k in loaders.loaders_and_params[state['loader']]:
+                user_config[model_regex][k] = state[k]
+
+        shared.user_config = user_config
+
+        output = yaml.dump(user_config, sort_keys=False)
+        with open(p, 'w') as f:
+            f.write(output)
+
+        yield (f"Settings for `{model}` saved to `{p}`.")
diff --git a/modules/monkey_patch_gptq_lora.py b/modules/monkey_patch_gptq_lora.py
new file mode 100644
index 0000000000000000000000000000000000000000..3166bd33ceba449cb542861b0238818f68c7b02e
--- /dev/null
+++ b/modules/monkey_patch_gptq_lora.py
@@ -0,0 +1,39 @@
+# Copied from https://github.com/johnsmith0031/alpaca_lora_4bit
+
+from pathlib import Path
+
+import alpaca_lora_4bit.autograd_4bit as autograd_4bit
+from alpaca_lora_4bit.amp_wrapper import AMPWrapper
+from alpaca_lora_4bit.autograd_4bit import (
+    Autograd4bitQuantLinear,
+    load_llama_model_4bit_low_ram
+)
+from alpaca_lora_4bit.models import Linear4bitLt
+from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
+    replace_peft_model_with_int4_lora_model
+)
+
+from modules import shared
+from modules.GPTQ_loader import find_quantized_model_file
+
+replace_peft_model_with_int4_lora_model()
+
+
+def load_model_llama(model_name):
+    config_path = str(Path(f'{shared.args.model_dir}/{model_name}'))
+    model_path = str(find_quantized_model_file(model_name))
+    model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=shared.args.groupsize, is_v1_model=False)
+    for _, m in model.named_modules():
+        if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt):
+            if m.is_v1_model:
+                m.zeros = m.zeros.half()
+            m.scales = m.scales.half()
+            m.bias = m.bias.half()
+
+    autograd_4bit.auto_switch = True
+
+    model.half()
+    wrapper = AMPWrapper(model)
+    wrapper.apply_generate()
+
+    return model, tokenizer
diff --git a/modules/one_click_installer_check.py b/modules/one_click_installer_check.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bde860094f5e9c1ad5e3293502c3606a9f4a130
--- /dev/null
+++ b/modules/one_click_installer_check.py
@@ -0,0 +1,9 @@
+from pathlib import Path
+
+from modules.logging_colors import logger
+
+if Path('../webui.py').exists():
+    logger.warning('\nIt looks like you are running an outdated version of '
+                   'the one-click-installers.\n'
+                   'Please migrate your installation following the instructions here:\n'
+                   'https://github.com/oobabooga/text-generation-webui/wiki/Migrating-an-old-one%E2%80%90click-install')
diff --git a/modules/presets.py b/modules/presets.py
new file mode 100644
index 0000000000000000000000000000000000000000..2686355d8e974f8ee89b226f60f6c2bfafb52463
--- /dev/null
+++ b/modules/presets.py
@@ -0,0 +1,131 @@
+import functools
+import random
+from pathlib import Path
+
+import yaml
+
+from modules import shared
+from modules.loaders import loaders_samplers
+from modules.logging_colors import logger
+
+
+def default_preset():
+    return {
+        'temperature': 1,
+        'temperature_last': False,
+        'dynamic_temperature': False,
+        'dynatemp_low': 1,
+        'dynatemp_high': 1,
+        'dynatemp_exponent': 1,
+        'smoothing_factor': 0,
+        'top_p': 1,
+        'min_p': 0,
+        'top_k': 0,
+        'repetition_penalty': 1,
+        'presence_penalty': 0,
+        'frequency_penalty': 0,
+        'repetition_penalty_range': 1024,
+        'typical_p': 1,
+        'tfs': 1,
+        'top_a': 0,
+        'epsilon_cutoff': 0,
+        'eta_cutoff': 0,
+        'guidance_scale': 1,
+        'penalty_alpha': 0,
+        'mirostat_mode': 0,
+        'mirostat_tau': 5,
+        'mirostat_eta': 0.1,
+        'do_sample': True,
+        'encoder_repetition_penalty': 1,
+        'no_repeat_ngram_size': 0,
+        'min_length': 0,
+        'num_beams': 1,
+        'length_penalty': 1,
+        'early_stopping': False,
+        'sampler_priority': 'temperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat'
+    }
+
+
+def presets_params():
+    return [k for k in default_preset()]
+
+
+def load_preset(name):
+    generate_params = default_preset()
+    if name not in ['None', None, '']:
+        path = Path(f'presets/{name}.yaml')
+        if path.exists():
+            with open(path, 'r') as infile:
+                preset = yaml.safe_load(infile)
+
+            for k in preset:
+                generate_params[k] = preset[k]
+        else:
+            logger.error(f"The preset \"{name}\" does not exist under \"{path}\". Using the default parameters.")
+
+    return generate_params
+
+
+@functools.cache
+def load_preset_memoized(name):
+    return load_preset(name)
+
+
+def load_preset_for_ui(name, state):
+    generate_params = load_preset(name)
+    state.update(generate_params)
+    return state, *[generate_params[k] for k in presets_params()]
+
+
+def random_preset(state):
+    params_and_values = {
+        'remove_tail_tokens': {
+            'top_p': [0.5, 0.8, 0.9, 0.95, 0.99],
+            'min_p': [0.5, 0.2, 0.1, 0.05, 0.01],
+            'top_k': [3, 5, 10, 20, 30, 40],
+            'typical_p': [0.2, 0.575, 0.95],
+            'tfs': [0.5, 0.8, 0.9, 0.95, 0.99],
+            'top_a': [0.5, 0.2, 0.1, 0.05, 0.01],
+            'epsilon_cutoff': [1, 3, 5, 7, 9],
+            'eta_cutoff': [3, 6, 9, 12, 15, 18],
+        },
+        'flatten_distribution': {
+            'temperature': [0.5, 0.7, 0.8, 1, 1.2, 1.5, 2.0],
+        },
+        'repetition': {
+            'repetition_penalty': [1, 1.05, 1.1, 1.15, 1.20, 1.25],
+            'presence_penalty': [0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0],
+            'frequency_penalty': [0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0],
+        },
+        'other': {
+            'temperature_last': [True, False],
+        }
+    }
+
+    generate_params = default_preset()
+    for cat in params_and_values:
+        choices = list(params_and_values[cat].keys())
+        if shared.args.loader is not None:
+            choices = [x for x in choices if x in loaders_samplers[shared.args.loader]]
+
+        if len(choices) > 0:
+            choice = random.choice(choices)
+            generate_params[choice] = random.choice(params_and_values[cat][choice])
+
+    state.update(generate_params)
+    return state, *[generate_params[k] for k in presets_params()]
+
+
+def generate_preset_yaml(state):
+    defaults = default_preset()
+    data = {k: state[k] for k in presets_params()}
+
+    # Remove entries that are identical to the defaults.
+    # sampler_priority is always saved because it is experimental
+    # and the default order may change.
+
+    for k in list(data.keys()):
+        if data[k] == defaults[k] and k != 'sampler_priority':
+            del data[k]
+
+    return yaml.dump(data, sort_keys=False)
diff --git a/modules/prompts.py b/modules/prompts.py
new file mode 100644
index 0000000000000000000000000000000000000000..565c245079d1824c6e96fd7ee2075c573f2081ac
--- /dev/null
+++ b/modules/prompts.py
@@ -0,0 +1,27 @@
+from pathlib import Path
+
+from modules.text_generation import get_encoded_length
+
+
+def load_prompt(fname):
+    if fname in ['None', '']:
+        return ''
+    else:
+        file_path = Path(f'prompts/{fname}.txt')
+        if not file_path.exists():
+            return ''
+
+        with open(file_path, 'r', encoding='utf-8') as f:
+            text = f.read()
+            if text[-1] == '\n':
+                text = text[:-1]
+
+            return text
+
+
+def count_tokens(text):
+    try:
+        tokens = get_encoded_length(text)
+        return str(tokens)
+    except:
+        return '0'
diff --git a/modules/relative_imports.py b/modules/relative_imports.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c0eb56b77c6cb6b38fdbdeebabe9ad3b8d91b97
--- /dev/null
+++ b/modules/relative_imports.py
@@ -0,0 +1,13 @@
+import sys
+from pathlib import Path
+
+
+class RelativeImport:
+    def __init__(self, path):
+        self.import_path = Path(path)
+
+    def __enter__(self):
+        sys.path.insert(0, str(self.import_path))
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        sys.path.remove(str(self.import_path))
diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f8de4168882e493275f93a3a9a05b9ce8d0c3cb
--- /dev/null
+++ b/modules/sampler_hijack.py
@@ -0,0 +1,490 @@
+import math
+import pprint
+
+import torch
+import transformers
+from transformers import LogitsWarper, is_torch_xpu_available
+from transformers.generation.logits_process import (
+    LogitNormalization,
+    LogitsProcessor,
+    LogitsProcessorList
+)
+
+from modules import shared
+from modules.logging_colors import logger
+
+global_scores = None
+
+
+class TemperatureLogitsWarperCustom(LogitsWarper):
+    '''
+    A copy of the original Transformers temperature logits warper.
+    '''
+
+    def __init__(self, temperature: float):
+        if not isinstance(temperature, float) or not (temperature > 0):
+            except_msg = (
+                f"`temperature` (={temperature}) has to be a strictly positive float, otherwise your next token "
+                "scores will be invalid."
+            )
+            if isinstance(temperature, float) and temperature == 0.0:
+                except_msg += " If you're looking for greedy decoding strategies, set `do_sample=False`."
+
+            raise ValueError(except_msg)
+
+        self.temperature = temperature
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        scores = scores / self.temperature
+        return scores
+
+
+class DynamicTemperatureLogitsWarper(LogitsWarper):
+    '''
+    Dynamic temperature.
+    '''
+
+    def __init__(self, dynatemp_low: float, dynatemp_high: float, dynatemp_exponent: float):
+        self.dynatemp_low = dynatemp_low
+        self.dynatemp_high = dynatemp_high
+        self.dynatemp_exponent = dynatemp_exponent
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        min_temp = self.dynatemp_low
+        max_temp = self.dynatemp_high
+        exponent_val = self.dynatemp_exponent
+
+        # Convert logits to probabilities
+        probs = torch.softmax(scores, dim=-1)
+
+        # Calculate entropy of the softmax probabilities
+        entropy = -1.0 * torch.where(probs > 0, probs * torch.log(probs), torch.zeros_like(probs)).sum()
+
+        # Guard against future possible division by zero
+        entropy = max(entropy, torch.tensor(1e-10))  # Ensures entropy is slightly greater than 0
+
+        # Any logits which are not -Infinity will be considered for calculating max entropy.
+        num_valid_tokens = torch.sum(scores > -float('inf')).item()
+
+        # Now, calculate the max entropy by using only the valid tokens' count
+        max_entropy = math.log(num_valid_tokens)
+
+        # Guard against future possible division by zero
+        max_entropy = max_entropy if max_entropy > 0.0 else 1e-10
+
+        # Normalize the entropy
+        normalized_entropy = entropy / max_entropy
+
+        # Map the normalized entropy to the desired temperature range using the power function
+        dyn_temp = min_temp + (max_temp - min_temp) * (normalized_entropy.pow(exponent_val))
+
+        # Apply the dynamically calculated temperature scaling
+        scores = scores / dyn_temp
+
+        # print("----------------------\nTemperature from generation_config:", self.temperature)
+        # print("min_temp:", min_temp)
+        # print("max_temp:", max_temp)
+        # print("Entropy:", entropy.item())
+        # print("Max Possible Entropy considering valid tokens only:", max_entropy)
+        # print("Normalized Entropy:", normalized_entropy.item())
+        # print("Dynamic Temperature (dyn_temp):", dyn_temp.item())
+        # print("----------------------")
+
+        # max_prob_token_id = torch.argmax(scores, dim=-1)  # Get the token ID with the highest probability
+        # max_prob_token = shared.tokenizer.convert_ids_to_tokens(int(max_prob_token_id))  # Convert ID to token
+        # print("--- T=", float(dyn_temp), "token=", max_prob_token, "min=", min_temp, "max=", max_temp, "exponent=", exponent_val)
+
+        return scores
+
+
+class QuadraticSamplingLogitsWarper(LogitsWarper):
+    '''
+    Quadratic sampling.
+    '''
+
+    def __init__(self, smoothing_factor: float):
+        self.smoothing_factor = smoothing_factor
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # Compute the maximum logit value
+        max_logit = scores.max()
+
+        # Apply the quadratic transformation
+        transformed_logits = -(self.smoothing_factor * (scores - max_logit)**2) + max_logit
+
+        # No need to print the top 5 logits since this is not required
+        # print("Original top 5 logits: ", torch.topk(scores, 5))
+        # print("New top 5 logits: ", torch.topk(transformed_logits, 5))
+
+        return transformed_logits
+
+
+class MinPLogitsWarper(LogitsWarper):
+    def __init__(self, min_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        if min_p < 0 or min_p > 1.0:
+            raise ValueError(f"`min_p` has to be a float >= 0 and <= 1, but is {min_p}")
+        self.min_p = min_p
+        self.filter_value = filter_value
+        self.min_tokens_to_keep = min_tokens_to_keep
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # Convert logits to probabilities
+        probs = torch.softmax(scores, dim=-1)
+        # Get the probability of the top token for each sequence in the batch
+        top_probs, _ = probs.max(dim=-1, keepdim=True)
+        # Calculate the actual min_p threshold by scaling min_p with the top token's probability
+        scaled_min_p = self.min_p * top_probs
+        # Create a mask for tokens that have a probability less than the scaled min_p
+        tokens_to_remove = probs < scaled_min_p
+
+        sorted_indices = torch.argsort(scores, descending=True, dim=-1)
+        sorted_indices_to_remove = torch.gather(tokens_to_remove, dim=-1, index=sorted_indices)
+
+        if self.min_tokens_to_keep > 1:
+            # Keep at least min_tokens_to_keep
+            sorted_indices_to_remove[..., : self.min_tokens_to_keep] = False
+
+        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+        scores = scores.masked_fill(indices_to_remove, self.filter_value)
+        return scores
+
+
+class TailFreeLogitsWarper(LogitsWarper):
+    def __init__(self, tfs: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        tfs = float(tfs)
+        if tfs < 0 or tfs > 1.0:
+            raise ValueError(f"`tfs` has to be a float >= 0 and <= 1, but is {tfs}")
+        self.tfs = tfs
+        self.filter_value = filter_value
+        self.min_tokens_to_keep = min_tokens_to_keep
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        sorted_logits, sorted_indices = torch.sort(scores, descending=True)
+        probs = sorted_logits.softmax(dim=-1)
+
+        # Compute second derivative normalized CDF
+        d2 = probs.diff().diff().abs()
+        normalized_d2 = d2 / d2.sum(dim=-1, keepdim=True)
+        normalized_d2_cdf = normalized_d2.cumsum(dim=-1)
+
+        # Remove tokens with CDF value above the threshold (token with 0 are kept)
+        sorted_indices_to_remove = normalized_d2_cdf > self.tfs
+
+        # Centre the distribution around the cutoff as in the original implementation of the algorithm
+        sorted_indices_to_remove = torch.cat(
+            (
+                torch.zeros(scores.shape[0], 1, dtype=torch.bool, device=scores.device),
+                sorted_indices_to_remove,
+                torch.ones(scores.shape[0], 1, dtype=torch.bool, device=scores.device),
+            ),
+            dim=-1,
+        )
+
+        if self.min_tokens_to_keep > 1:
+            # Keep at least min_tokens_to_keep
+            sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0
+
+        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+        scores = scores.masked_fill(indices_to_remove, self.filter_value)
+        return scores
+
+
+class TopALogitsWarper(LogitsWarper):
+    def __init__(self, top_a: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        top_a = float(top_a)
+        if top_a < 0 or top_a > 1.0:
+            raise ValueError(f"`top_a` has to be a float >= 0 and <= 1, but is {top_a}")
+        self.top_a = top_a
+        self.filter_value = filter_value
+        self.min_tokens_to_keep = min_tokens_to_keep
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        sorted_logits, sorted_indices = torch.sort(scores, descending=True)
+        probs = sorted_logits.softmax(dim=-1)
+
+        # Remove tokens with probability less than top_a*(max(probs))^2 (token with 0 are kept)
+        probs_max = probs[..., 0, None]
+        sorted_indices_to_remove = probs < probs_max * probs_max * self.top_a
+
+        if self.min_tokens_to_keep > 1:
+            # Keep at least min_tokens_to_keep
+            sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0
+
+        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+        scores = scores.masked_fill(indices_to_remove, self.filter_value)
+        return scores
+
+
+class MirostatLogitsWarper(LogitsWarper):
+    def __init__(self, mirostat_mode: int, mirostat_tau: float, mirostat_eta: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
+        if mirostat_mode not in [2]:
+            raise ValueError(f"`mirostat` has to be a an integer 2, but is {mirostat_mode}")
+
+        self.mirostat_mode = mirostat_mode
+        self.mirostat_eta = mirostat_eta
+        self.mirostat_tau = mirostat_tau
+        self.filter_value = filter_value
+        self.min_tokens_to_keep = min_tokens_to_keep
+        self.mu = 2 * self.mirostat_tau
+        self.e = 0
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        logits = scores[0]
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        prob_original = torch.softmax(sorted_logits, dim=-1).tolist()  # candidates
+
+        # Truncate the words with surprise values greater than mu
+        for i, candidate in enumerate(prob_original):
+            if candidate > 0 and -math.log2(candidate) > self.mu:
+                if (i == 0):
+                    sorted_logits = sorted_logits[:1]
+                else:
+                    sorted_logits = sorted_logits[:i]
+                break
+
+        # Normalize the probabilities of the remaining words
+        if is_torch_xpu_available():
+            prob_topk = torch.softmax(sorted_logits, dim=0).to("xpu")
+            prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to("xpu")
+        else:
+            prob_topk = torch.softmax(sorted_logits, dim=0).to('cuda')
+            prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True).to('cuda')
+
+        observed_surprise = -math.log2(prob_topk[prev_i])
+        self.e = observed_surprise - self.mirostat_tau
+
+        # Update mu using the learning rate and error
+        self.mu -= self.mirostat_eta * self.e
+
+        sorted_indices_to_remove = torch.ones_like(scores[0], dtype=torch.bool)
+        sorted_indices_to_remove[prev_i] = False
+
+        indices_to_remove = sorted_indices_to_remove.unsqueeze(0).scatter(1, sorted_indices.unsqueeze(0), sorted_indices_to_remove.unsqueeze(0))
+        scores = scores.masked_fill(indices_to_remove, self.filter_value)
+        return scores
+
+
+class SpyLogitsWarper(LogitsWarper):
+    def __init__(self):
+        pass
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        global global_scores
+        global_scores = scores
+        return scores
+
+
+class RepetitionPenaltyLogitsProcessorWithRange(LogitsProcessor):
+    '''
+    Copied from the transformers library
+    '''
+
+    def __init__(self, penalty: float, presence_penalty: float, frequency_penalty: float, _range: int):
+        if not (penalty > 0):
+            raise ValueError(f"`penalty` has to be strictly positive, but is {penalty}")
+
+        self.penalty = penalty
+        self.presence_penalty = presence_penalty
+        self.frequency_penalty = frequency_penalty
+        self._range = _range
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        input_ids = input_ids[:, -self._range:]
+
+        # We loop here because torch.unique() needs to process each row separately in the
+        # case that batch_size > 1.
+        for input_ids_row, scores_row in zip(input_ids, scores):
+            unique_ids, counts = torch.unique(input_ids_row, return_counts=True)
+            score = torch.gather(scores_row, 0, unique_ids)
+
+            # multiplicative repetition penalty
+            # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
+            score = torch.where(score < 0, score * self.penalty, score / self.penalty)
+            scores_row.scatter_(0, unique_ids, score)
+
+            # presence_penalty and frequency_penalty
+            raw_presence_penalty = (counts > 0).to(scores.dtype)
+            raw_frequency_penalty = counts.to(scores.dtype)
+            additive_penalty = raw_presence_penalty * self.presence_penalty + raw_frequency_penalty * self.frequency_penalty
+            scores_row.scatter_add_(0, unique_ids, -additive_penalty)
+
+        return scores
+
+
+def get_logits_warper_patch(self, generation_config):
+
+    # Parameter sanitization
+    if isinstance(generation_config.temperature, int):
+        generation_config.temperature = float(generation_config.temperature)  # Must be float
+
+    # Get the original warpers
+    warpers = self._get_logits_warper_old(generation_config)
+
+    # Replace temperature with our modified class.
+    # Currently, it behaves identically to the original.
+    for i in range(len(warpers)):
+        if warpers[i].__class__.__name__ == 'TemperatureLogitsWarper':
+            warpers[i] = TemperatureLogitsWarperCustom(
+                generation_config.temperature,
+            )
+
+    # Add custom warpers
+    warpers_to_add = LogitsProcessorList()
+    min_tokens_to_keep = 2 if generation_config.num_beams > 1 else 1
+    if generation_config.tfs is not None and 0.0 <= generation_config.tfs < 1.0:
+        warpers_to_add.append(
+            TailFreeLogitsWarper(
+                tfs=generation_config.tfs,
+                min_tokens_to_keep=min_tokens_to_keep
+            )
+        )
+
+    if generation_config.top_a is not None and 0.0 < generation_config.top_a <= 1.0:
+        warpers_to_add.append(
+            TopALogitsWarper(
+                top_a=generation_config.top_a,
+                min_tokens_to_keep=min_tokens_to_keep
+            )
+        )
+
+    if generation_config.min_p is not None and 0.0 < generation_config.min_p <= 1.0:
+        warpers_to_add.append(
+            MinPLogitsWarper(
+                min_p=generation_config.min_p,
+                min_tokens_to_keep=min_tokens_to_keep
+            )
+        )
+
+    if generation_config.dynamic_temperature:
+        warpers_to_add.append(
+            DynamicTemperatureLogitsWarper(
+                dynatemp_low=generation_config.dynatemp_low,
+                dynatemp_high=generation_config.dynatemp_high,
+                dynatemp_exponent=generation_config.dynatemp_exponent,
+            )
+        )
+
+    if generation_config.smoothing_factor > 0:
+        warpers_to_add.append(
+            QuadraticSamplingLogitsWarper(
+                smoothing_factor=generation_config.smoothing_factor
+            )
+        )
+
+    if generation_config.mirostat_mode is not None and generation_config.mirostat_mode == 2:
+        warpers_to_add.append(
+            MirostatLogitsWarper(
+                mirostat_mode=generation_config.mirostat_mode,
+                mirostat_eta=generation_config.mirostat_eta,
+                mirostat_tau=generation_config.mirostat_tau,
+                min_tokens_to_keep=min_tokens_to_keep
+            )
+        )
+
+    if len(warpers) > 0 and isinstance(warpers[-1], LogitNormalization):
+        normalize = warpers.pop(-1)
+    else:
+        normalize = None
+
+    warpers += warpers_to_add
+
+    # Sort the samplers.
+    sampler_priority = generation_config.sampler_priority
+
+    # Handle temperature_last
+    if generation_config.temperature_last:
+        for param_name in ['temperature', 'dynamic_temperature', 'quadratic_sampling']:
+            if param_name in sampler_priority:
+                if param_name in sampler_priority:
+                    index = sampler_priority.index(param_name)
+                    sampler_priority.append(sampler_priority.pop(index))
+                else:
+                    sampler_priority.append(param_name)
+
+    class_name_to_nickname = {
+        'DynamicTemperatureLogitsWarper': 'dynamic_temperature',
+        'EpsilonLogitsWarper': 'epsilon_cutoff',
+        'EtaLogitsWarper': 'eta_cutoff',
+        'MinPLogitsWarper': 'min_p',
+        'MirostatLogitsWarper': 'mirostat',
+        'QuadraticSamplingLogitsWarper': 'quadratic_sampling',
+        'TailFreeLogitsWarper': 'tfs',
+        'TemperatureLogitsWarperCustom': 'temperature',
+        'TopALogitsWarper': 'top_a',
+        'TopKLogitsWarper': 'top_k',
+        'TopPLogitsWarper': 'top_p',
+        'TypicalLogitsWarper': 'typical_p'
+    }
+
+    def custom_sort_key(obj):
+        class_name = obj.__class__.__name__
+
+        # Return a large value if class name is not mapped or if the mapped nickname is not in priority
+        if class_name not in class_name_to_nickname or class_name_to_nickname[class_name] not in sampler_priority:
+            return float('inf')
+
+        # Return the index of the nickname in the priority list for sorting
+        return sampler_priority.index(class_name_to_nickname[class_name])
+
+    # Sort the list using the custom key function
+    warpers = sorted(warpers, key=custom_sort_key)
+    if shared.args.verbose:
+        logger.info("WARPERS=")
+        pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint([x.__class__.__name__ for x in warpers])
+
+    if normalize is not None:
+        warpers.append(normalize)
+
+    warpers.append(SpyLogitsWarper())
+    warpers = LogitsProcessorList(warpers)
+    return warpers
+
+
+def get_logits_processor_patch(self, **kwargs):
+    repetition_penalty = kwargs['generation_config'].repetition_penalty
+    presence_penalty = kwargs['generation_config'].presence_penalty
+    frequency_penalty = kwargs['generation_config'].frequency_penalty
+    repetition_penalty_range = kwargs['generation_config'].repetition_penalty_range
+    do_rep_pen_hijack = (repetition_penalty > 1) or (presence_penalty != 0) or (frequency_penalty != 0)
+    if do_rep_pen_hijack:
+        kwargs['generation_config'].repetition_penalty = 1.1  # Set to value > 1 to ensure RepetitionPenaltyLogitsProcessor is created
+
+    result = self._get_logits_processor_old(**kwargs)
+
+    if do_rep_pen_hijack:
+        for i in range(len(result)):
+            if result[i].__class__.__name__ == 'RepetitionPenaltyLogitsProcessor':
+                result[i] = RepetitionPenaltyLogitsProcessorWithRange(repetition_penalty, presence_penalty, frequency_penalty, repetition_penalty_range)
+
+    return result
+
+
+def generation_config_init_patch(self, **kwargs):
+    self.__init___old(**kwargs)
+    self.min_p = kwargs.pop("min_p", 0.0)
+    self.dynamic_temperature = kwargs.pop("dynamic_temperature", False)
+    self.dynatemp_low = kwargs.pop("dynatemp_low", 1)
+    self.dynatemp_high = kwargs.pop("dynatemp_high", 1)
+    self.dynatemp_exponent = kwargs.pop("dynatemp_exponent", 1)
+    self.smoothing_factor = kwargs.pop("smoothing_factor", 0.0)
+    self.tfs = kwargs.pop("tfs", 1.0)
+    self.top_a = kwargs.pop("top_a", 0.0)
+    self.mirostat_mode = kwargs.pop("mirostat_mode", 0)
+    self.mirostat_eta = kwargs.pop("mirostat_eta", 0.1)
+    self.mirostat_tau = kwargs.pop("mirostat_tau", 5)
+    self.repetition_penalty_range = kwargs.pop("repetition_penalty_range", 0)
+    self.presence_penalty = kwargs.pop("presence_penalty", 0)
+    self.frequency_penalty = kwargs.pop("frequency_penalty", 0)
+    self.temperature_last = kwargs.pop("temperature_last", False)
+    self.sampler_priority = kwargs.pop("sampler_priority", ['temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat'])
+
+
+def hijack_samplers():
+    transformers.GenerationMixin._get_logits_warper_old = transformers.GenerationMixin._get_logits_warper
+    transformers.GenerationMixin._get_logits_warper = get_logits_warper_patch
+
+    transformers.GenerationMixin._get_logits_processor_old = transformers.GenerationMixin._get_logits_processor
+    transformers.GenerationMixin._get_logits_processor = get_logits_processor_patch
+
+    transformers.GenerationConfig.__init___old = transformers.GenerationConfig.__init__
+    transformers.GenerationConfig.__init__ = generation_config_init_patch
diff --git a/modules/shared.py b/modules/shared.py
new file mode 100644
index 0000000000000000000000000000000000000000..2861d6901956931db82c36efde1b278b8ed53849
--- /dev/null
+++ b/modules/shared.py
@@ -0,0 +1,307 @@
+import argparse
+import copy
+import os
+import sys
+from collections import OrderedDict
+from pathlib import Path
+
+import yaml
+
+from modules.logging_colors import logger
+
+# Model variables
+model = None
+tokenizer = None
+model_name = 'None'
+is_seq2seq = False
+model_dirty_from_training = False
+lora_names = []
+
+# Generation variables
+stop_everything = False
+generation_lock = None
+processing_message = '*Is typing...*'
+
+# UI variables
+gradio = {}
+persistent_interface_state = {}
+need_restart = False
+
+# UI defaults
+settings = {
+    'dark_theme': True,
+    'show_controls': True,
+    'start_with': '',
+    'mode': 'chat',
+    'chat_style': 'cai-chat',
+    'prompt-default': 'QA',
+    'prompt-notebook': 'QA',
+    'preset': 'simple-1',
+    'max_new_tokens': 512,
+    'max_new_tokens_min': 1,
+    'max_new_tokens_max': 4096,
+    'negative_prompt': '',
+    'seed': -1,
+    'truncation_length': 2048,
+    'truncation_length_min': 0,
+    'truncation_length_max': 200000,
+    'max_tokens_second': 0,
+    'max_updates_second': 0,
+    'prompt_lookup_num_tokens': 0,
+    'custom_stopping_strings': '',
+    'custom_token_bans': '',
+    'auto_max_new_tokens': False,
+    'ban_eos_token': False,
+    'add_bos_token': True,
+    'skip_special_tokens': True,
+    'stream': True,
+    'character': 'Assistant',
+    'name1': 'You',
+    'custom_system_message': '',
+    'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n    {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' -%}\n        {{- '' + message['content'] + '\\n\\n' -}}\n    {%- else -%}\n        {%- if message['role'] == 'user' -%}\n            {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n        {%- else -%}\n            {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n        {%- endif -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n    {{-'### Response:\\n'-}}\n{%- endif -%}",
+    'chat_template_str': "{%- for message in messages %}\n    {%- if message['role'] == 'system' -%}\n        {{- message['content'] + '\\n\\n' -}}\n    {%- else -%}\n        {%- if message['role'] == 'user' -%}\n            {{- name1 + ': ' + message['content'] + '\\n'-}}\n        {%- else -%}\n            {{- name2 + ': ' + message['content'] + '\\n' -}}\n        {%- endif -%}\n    {%- endif -%}\n{%- endfor -%}",
+    'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
+    'autoload_model': False,
+    'gallery-items_per_page': 50,
+    'gallery-open': False,
+    'default_extensions': ['gallery'],
+}
+
+default_settings = copy.deepcopy(settings)
+
+# Parser copied from https://github.com/vladmandic/automatic
+parser = argparse.ArgumentParser(description="Text generation web UI", conflict_handler='resolve', add_help=True, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=55, indent_increment=2, width=200))
+
+# Basic settings
+group = parser.add_argument_group('Basic settings')
+group.add_argument('--multi-user', action='store_true', help='Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly.')
+group.add_argument('--character', type=str, help='The name of the character to load in chat mode by default.')
+group.add_argument('--model', type=str, help='Name of the model to load by default.')
+group.add_argument('--lora', type=str, nargs='+', help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.')
+group.add_argument('--model-dir', type=str, default='models/', help='Path to directory with all the models.')
+group.add_argument('--lora-dir', type=str, default='loras/', help='Path to directory with all the loras.')
+group.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.')
+group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.')
+group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
+group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
+group.add_argument('--chat-buttons', action='store_true', help='Show buttons on the chat tab instead of a hover menu.')
+
+# Model loader
+group = parser.add_argument_group('Model loader')
+group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#.')
+
+# Transformers/Accelerate
+group = parser.add_argument_group('Transformers/Accelerate')
+group.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
+group.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
+group.add_argument('--gpu-memory', type=str, nargs='+', help='Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.')
+group.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.')
+group.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
+group.add_argument('--disk-cache-dir', type=str, default='cache', help='Directory to save the disk cache to. Defaults to "cache".')
+group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')
+group.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
+group.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost.')
+group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
+group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
+group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
+group.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.')
+
+# bitsandbytes 4-bit
+group = parser.add_argument_group('bitsandbytes 4-bit')
+group.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
+group.add_argument('--use_double_quant', action='store_true', help='use_double_quant for 4-bit.')
+group.add_argument('--compute_dtype', type=str, default='float16', help='compute dtype for 4-bit. Valid options: bfloat16, float16, float32.')
+group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for 4-bit. Valid options: nf4, fp4.')
+
+# llama.cpp
+group = parser.add_argument_group('llama.cpp')
+group.add_argument('--tensorcores', action='store_true', help='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')
+group.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.')
+group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
+group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
+group.add_argument('--no_mul_mat_q', action='store_true', help='Disable the mulmat kernels.')
+group.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
+group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
+group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
+group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
+group.add_argument('--tensor_split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17.')
+group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
+group.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
+group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the  K, Q, V to the GPU. This saves VRAM but reduces the performance.')
+group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')
+group.add_argument('--row_split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
+
+# ExLlamaV2
+group = parser.add_argument_group('ExLlamaV2')
+group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
+group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.')
+group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
+group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
+group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.')
+group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
+
+# AutoGPTQ
+group = parser.add_argument_group('AutoGPTQ')
+group.add_argument('--triton', action='store_true', help='Use triton.')
+group.add_argument('--no_inject_fused_attention', action='store_true', help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.')
+group.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.')
+group.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.')
+group.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
+group.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.')
+group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.')
+
+# GPTQ-for-LLaMa
+group = parser.add_argument_group('GPTQ-for-LLaMa')
+group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
+group.add_argument('--model_type', type=str, help='Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.')
+group.add_argument('--groupsize', type=int, default=-1, help='Group size.')
+group.add_argument('--pre_layer', type=int, nargs='+', help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg --pre_layer 30 60.')
+group.add_argument('--checkpoint', type=str, help='The path to the quantized checkpoint file. If not specified, it will be automatically detected.')
+group.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.')
+
+# HQQ
+group = parser.add_argument_group('HQQ')
+group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
+
+# DeepSpeed
+group = parser.add_argument_group('DeepSpeed')
+group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
+group.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
+group.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.')
+
+# RoPE
+group = parser.add_argument_group('RoPE')
+group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.')
+group.add_argument('--rope_freq_base', type=int, default=0, help='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).')
+group.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.")
+
+# Gradio
+group = parser.add_argument_group('Gradio')
+group.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
+group.add_argument('--listen-port', type=int, help='The listening port that the server will use.')
+group.add_argument('--listen-host', type=str, help='The hostname that the server will use.')
+group.add_argument('--share', action='store_true', help='Create a public URL. This is useful for running the web UI on Google Colab or similar.')
+group.add_argument('--auto-launch', action='store_true', default=False, help='Open the web UI in the default browser upon launch.')
+group.add_argument('--gradio-auth', type=str, help='Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3".', default=None)
+group.add_argument('--gradio-auth-path', type=str, help='Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above.', default=None)
+group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certificate key file.', default=None)
+group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None)
+
+# API
+group = parser.add_argument_group('API')
+group.add_argument('--api', action='store_true', help='Enable the API extension.')
+group.add_argument('--public-api', action='store_true', help='Create a public URL for the API using Cloudfare.')
+group.add_argument('--public-api-id', type=str, help='Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.', default=None)
+group.add_argument('--api-port', type=int, default=5000, help='The listening port for the API.')
+group.add_argument('--api-key', type=str, default='', help='API authentication key.')
+group.add_argument('--admin-key', type=str, default='', help='API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.')
+group.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. Useful for launching the API in standalone mode.')
+
+# Multimodal
+group = parser.add_argument_group('Multimodal')
+group.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.')
+
+# Deprecated parameters
+# group = parser.add_argument_group('Deprecated')
+
+args = parser.parse_args()
+args_defaults = parser.parse_args([])
+provided_arguments = []
+for arg in sys.argv[1:]:
+    arg = arg.lstrip('-').replace('-', '_')
+    if hasattr(args, arg):
+        provided_arguments.append(arg)
+
+deprecated_args = []
+
+
+def do_cmd_flags_warnings():
+
+    # Deprecation warnings
+    for k in deprecated_args:
+        if getattr(args, k):
+            logger.warning(f'The --{k} flag has been deprecated and will be removed soon. Please remove that flag.')
+
+    # Security warnings
+    if args.trust_remote_code:
+        logger.warning('trust_remote_code is enabled. This is dangerous.')
+    if 'COLAB_GPU' not in os.environ and not args.nowebui:
+        if args.share:
+            logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. Use it with care.")
+        if any((args.listen, args.share)) and not any((args.gradio_auth, args.gradio_auth_path)):
+            logger.warning("\nYou are potentially exposing the web UI to the entire internet without any access password.\nYou can create one with the \"--gradio-auth\" flag like this:\n\n--gradio-auth username:password\n\nMake sure to replace username:password with your own.")
+            if args.multi_user:
+                logger.warning('\nThe multi-user mode is highly experimental and should not be shared publicly.')
+
+
+def fix_loader_name(name):
+    if not name:
+        return name
+
+    name = name.lower()
+    if name in ['llamacpp', 'llama.cpp', 'llama-cpp', 'llama cpp']:
+        return 'llama.cpp'
+    if name in ['llamacpp_hf', 'llama.cpp_hf', 'llama-cpp-hf', 'llamacpp-hf', 'llama.cpp-hf']:
+        return 'llamacpp_HF'
+    elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']:
+        return 'Transformers'
+    elif name in ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq']:
+        return 'AutoGPTQ'
+    elif name in ['gptq-for-llama', 'gptqforllama', 'gptqllama', 'gptq for llama', 'gptq_for_llama']:
+        return 'GPTQ-for-LLaMa'
+    elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']:
+        return 'ExLlama'
+    elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:
+        return 'ExLlamav2'
+    elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
+        return 'ExLlamav2_HF'
+    elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']:
+        return 'ctransformers'
+    elif name in ['autoawq', 'awq', 'auto-awq']:
+        return 'AutoAWQ'
+    elif name in ['quip#', 'quip-sharp', 'quipsharp', 'quip_sharp']:
+        return 'QuIP#'
+    elif name in ['hqq']:
+        return 'HQQ'
+
+
+def add_extension(name, last=False):
+    if args.extensions is None:
+        args.extensions = [name]
+    elif last:
+        args.extensions = [x for x in args.extensions if x != name]
+        args.extensions.append(name)
+    elif name not in args.extensions:
+        args.extensions.append(name)
+
+
+def is_chat():
+    return True
+
+
+args.loader = fix_loader_name(args.loader)
+
+# Activate the multimodal extension
+if args.multimodal_pipeline is not None:
+    add_extension('multimodal')
+
+# Activate the API extension
+if args.api or args.public_api:
+    add_extension('openai', last=True)
+
+# Load model-specific settings
+with Path(f'{args.model_dir}/config.yaml') as p:
+    if p.exists():
+        model_config = yaml.safe_load(open(p, 'r').read())
+    else:
+        model_config = {}
+
+# Load custom model-specific settings
+with Path(f'{args.model_dir}/config-user.yaml') as p:
+    if p.exists():
+        user_config = yaml.safe_load(open(p, 'r').read())
+    else:
+        user_config = {}
+
+model_config = OrderedDict(model_config)
+user_config = OrderedDict(user_config)
diff --git a/modules/text_generation.py b/modules/text_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..1917a0c16230865238a4b6c3d1971c672f428578
--- /dev/null
+++ b/modules/text_generation.py
@@ -0,0 +1,447 @@
+import ast
+import copy
+import html
+import pprint
+import random
+import re
+import time
+import traceback
+
+import numpy as np
+import torch
+import transformers
+from transformers import LogitsProcessorList, is_torch_xpu_available
+
+import modules.shared as shared
+from modules.callbacks import (
+    Iteratorize,
+    Stream,
+    _StopEverythingStoppingCriteria
+)
+from modules.extensions import apply_extensions
+from modules.grammar.grammar_utils import initialize_grammar
+from modules.grammar.logits_process import GrammarConstrainedLogitsProcessor
+from modules.html_generator import generate_4chan_html, generate_basic_html
+from modules.logging_colors import logger
+from modules.models import clear_torch_cache, local_rank
+
+
+def generate_reply(*args, **kwargs):
+    shared.generation_lock.acquire()
+    try:
+        for result in _generate_reply(*args, **kwargs):
+            yield result
+    finally:
+        shared.generation_lock.release()
+
+
+def _generate_reply(question, state, stopping_strings=None, is_chat=False, escape_html=False, for_ui=False):
+
+    # Find the appropriate generation function
+    generate_func = apply_extensions('custom_generate_reply')
+    if generate_func is None:
+        if shared.model_name == 'None' or shared.model is None:
+            logger.error("No model is loaded! Select one in the Model tab.")
+            yield ''
+            return
+
+        if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel']:
+            generate_func = generate_reply_custom
+        else:
+            generate_func = generate_reply_HF
+
+    if generate_func != generate_reply_HF and shared.args.verbose:
+        logger.info("PROMPT=")
+        print(question)
+        print()
+
+    # Prepare the input
+    original_question = question
+    if not is_chat:
+        state = apply_extensions('state', state)
+        question = apply_extensions('input', question, state)
+
+    # Find the stopping strings
+    all_stop_strings = []
+    for st in (stopping_strings, state['custom_stopping_strings']):
+        if type(st) is str:
+            st = ast.literal_eval(f"[{st}]")
+
+        if type(st) is list and len(st) > 0:
+            all_stop_strings += st
+
+    shared.stop_everything = False
+    clear_torch_cache()
+    seed = set_manual_seed(state['seed'])
+    last_update = -1
+    reply = ''
+    is_stream = state['stream']
+    if len(all_stop_strings) > 0 and not state['stream']:
+        state = copy.deepcopy(state)
+        state['stream'] = True
+
+    min_update_interval = 0
+    if state.get('max_updates_second', 0) > 0:
+        min_update_interval = 1 / state['max_updates_second']
+
+    # Generate
+    for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat):
+        reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
+        if escape_html:
+            reply = html.escape(reply)
+        if is_stream:
+            cur_time = time.time()
+
+            # Maximum number of tokens/second
+            if state['max_tokens_second'] > 0:
+                diff = 1 / state['max_tokens_second'] - (cur_time - last_update)
+                if diff > 0:
+                    time.sleep(diff)
+
+                last_update = time.time()
+                yield reply
+
+            # Limit updates to avoid lag in the Gradio UI
+            # API updates are not limited
+            else:
+                if cur_time - last_update > min_update_interval:
+                    last_update = cur_time
+                    yield reply
+
+        if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything):
+            break
+
+    if not is_chat:
+        reply = apply_extensions('output', reply, state)
+
+    yield reply
+
+
+def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
+    if shared.tokenizer is None:
+        raise ValueError('No tokenizer is loaded')
+
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel', 'Exllamav2Model']:
+        input_ids = shared.tokenizer.encode(str(prompt))
+        if shared.model.__class__.__name__ not in ['Exllamav2Model']:
+            input_ids = np.array(input_ids).reshape(1, len(input_ids))
+    else:
+        input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
+        if not add_bos_token:
+            while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id:
+                input_ids = input_ids[:, 1:]
+
+    # Handling truncation
+    if truncation_length is not None:
+        input_ids = input_ids[:, -truncation_length:]
+
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
+        return input_ids
+    elif shared.args.deepspeed:
+        return input_ids.to(device=local_rank)
+    elif torch.backends.mps.is_available():
+        device = torch.device('mps')
+        return input_ids.to(device)
+    elif is_torch_xpu_available():
+        return input_ids.to("xpu:0")
+    else:
+        return input_ids.cuda()
+
+
+def decode(output_ids, skip_special_tokens=True):
+    if shared.tokenizer is None:
+        raise ValueError('No tokenizer is loaded')
+
+    return shared.tokenizer.decode(output_ids, skip_special_tokens=skip_special_tokens)
+
+
+def get_encoded_length(prompt):
+    length_after_extensions = apply_extensions('tokenized_length', prompt)
+    if length_after_extensions is not None:
+        return length_after_extensions
+
+    return len(encode(prompt)[0])
+
+
+def get_token_ids(prompt):
+    tokens = encode(prompt)[0]
+    decoded_tokens = [shared.tokenizer.decode([i]) for i in tokens]
+
+    output = ''
+    for row in list(zip(tokens, decoded_tokens)):
+        output += f"{str(int(row[0])).ljust(5)}  -  {repr(row[1])}\n"
+
+    return output
+
+
+def get_max_prompt_length(state):
+    return state['truncation_length'] - state['max_new_tokens']
+
+
+def generate_reply_wrapper(question, state, stopping_strings=None):
+    """
+    Returns formatted outputs for the UI
+    """
+    reply = question if not shared.is_seq2seq else ''
+    yield formatted_outputs(reply, shared.model_name)
+
+    for reply in generate_reply(question, state, stopping_strings, is_chat=False, escape_html=True, for_ui=True):
+        if not shared.is_seq2seq:
+            reply = question + reply
+
+        yield formatted_outputs(reply, shared.model_name)
+
+
+def formatted_outputs(reply, model_name):
+    if any(s in model_name for s in ['gpt-4chan', 'gpt4chan']):
+        reply = fix_gpt4chan(reply)
+        return html.unescape(reply), generate_4chan_html(reply)
+    else:
+        return html.unescape(reply), generate_basic_html(reply)
+
+
+def fix_gpt4chan(s):
+    """
+    Removes empty replies from gpt4chan outputs
+    """
+    for i in range(10):
+        s = re.sub("--- [0-9]*\n>>[0-9]*\n---", "---", s)
+        s = re.sub("--- [0-9]*\n *\n---", "---", s)
+        s = re.sub("--- [0-9]*\n\n\n---", "---", s)
+
+    return s
+
+
+def fix_galactica(s):
+    """
+    Fix the LaTeX equations in GALACTICA
+    """
+    s = s.replace(r'\[', r'$')
+    s = s.replace(r'\]', r'$')
+    s = s.replace(r'\(', r'$')
+    s = s.replace(r'\)', r'$')
+    s = s.replace(r'$$', r'$')
+    s = re.sub(r'\n', r'\n\n', s)
+    s = re.sub(r"\n{3,}", "\n\n", s)
+    return s
+
+
+def set_manual_seed(seed):
+    seed = int(seed)
+    if seed == -1:
+        seed = random.randint(1, 2**31)
+
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+    elif is_torch_xpu_available():
+        torch.xpu.manual_seed_all(seed)
+
+    return seed
+
+
+def stop_everything_event():
+    shared.stop_everything = True
+
+
+def apply_stopping_strings(reply, all_stop_strings):
+    stop_found = False
+    for string in all_stop_strings:
+        idx = reply.find(string)
+        if idx != -1:
+            reply = reply[:idx]
+            stop_found = True
+            break
+
+    if not stop_found:
+        # If something like "\nYo" is generated just before "\nYou:"
+        # is completed, trim it
+        for string in all_stop_strings:
+            for j in range(len(string) - 1, 0, -1):
+                if reply[-j:] == string[:j]:
+                    reply = reply[:-j]
+                    break
+            else:
+                continue
+
+            break
+
+    return reply, stop_found
+
+
+def get_reply_from_output_ids(output_ids, state=None, starting_from=0):
+    reply = decode(output_ids[starting_from:], state['skip_special_tokens'] if state else True)
+
+    # Handle tokenizers that do not add the leading space for the first token
+    if (hasattr(shared.tokenizer, 'convert_ids_to_tokens') and len(output_ids) > starting_from) and not reply.startswith(' '):
+        first_token = shared.tokenizer.convert_ids_to_tokens(int(output_ids[starting_from]))
+        if isinstance(first_token, (bytes,)):
+            first_token = first_token.decode('utf8')
+
+        if first_token.startswith('▁'):
+            reply = ' ' + reply
+
+    return reply
+
+
+def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False):
+    generate_params = {}
+    for k in ['max_new_tokens', 'temperature', 'temperature_last', 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'top_p', 'min_p', 'top_k', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'typical_p', 'tfs', 'top_a', 'guidance_scale', 'penalty_alpha', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'do_sample', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'min_length', 'num_beams', 'length_penalty', 'early_stopping']:
+        if k in state:
+            generate_params[k] = state[k]
+
+    if isinstance(state['sampler_priority'], list):
+        generate_params['sampler_priority'] = state['sampler_priority']
+    elif isinstance(state['sampler_priority'], str):
+        generate_params['sampler_priority'] = [x.strip() for x in state['sampler_priority'].replace('\n', ',').split(',') if x.strip()]
+
+    if state['negative_prompt'] != '':
+        generate_params['negative_prompt_ids'] = encode(state['negative_prompt'])
+
+    if state['prompt_lookup_num_tokens'] > 0:
+        generate_params['prompt_lookup_num_tokens'] = state['prompt_lookup_num_tokens']
+
+    for k in ['epsilon_cutoff', 'eta_cutoff']:
+        if state[k] > 0:
+            generate_params[k] = state[k] * 1e-4
+
+    if state['ban_eos_token']:
+        generate_params['suppress_tokens'] = [shared.tokenizer.eos_token_id]
+
+    if state['custom_token_bans']:
+        to_ban = [int(x) for x in state['custom_token_bans'].split(',')]
+        if len(to_ban) > 0:
+            if generate_params.get('suppress_tokens', None):
+                generate_params['suppress_tokens'] += to_ban
+            else:
+                generate_params['suppress_tokens'] = to_ban
+
+    generate_params.update({'use_cache': not shared.args.no_cache})
+    if shared.args.deepspeed:
+        generate_params.update({'synced_gpus': True})
+
+    # Encode the input
+    input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state))
+    output = input_ids[0]
+    cuda = not any((shared.args.cpu, shared.args.deepspeed))
+    if state['auto_max_new_tokens']:
+        generate_params['max_new_tokens'] = state['truncation_length'] - input_ids.shape[-1]
+
+    # Add the encoded tokens to generate_params
+    question, input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, input_ids, None)
+    original_input_ids = input_ids
+    generate_params.update({'inputs': input_ids})
+    if inputs_embeds is not None:
+        generate_params.update({'inputs_embeds': inputs_embeds})
+
+    # Stopping criteria / eos token
+    eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else []
+    generate_params['eos_token_id'] = eos_token_ids
+    generate_params['stopping_criteria'] = transformers.StoppingCriteriaList()
+    generate_params['stopping_criteria'].append(_StopEverythingStoppingCriteria())
+
+    # Logits processor
+    processor = state.get('logits_processor', LogitsProcessorList([]))
+    if not isinstance(processor, LogitsProcessorList):
+        processor = LogitsProcessorList([processor])
+
+    # Grammar
+    if state['grammar_string'].strip() != '':
+        grammar = initialize_grammar(state['grammar_string'])
+        grammar_processor = GrammarConstrainedLogitsProcessor(grammar)
+        processor.append(grammar_processor)
+
+    apply_extensions('logits_processor', processor, input_ids)
+    generate_params['logits_processor'] = processor
+
+    if shared.args.verbose:
+        logger.info("GENERATE_PARAMS=")
+        filtered_params = {key: value for key, value in generate_params.items() if not isinstance(value, torch.Tensor)}
+        pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(filtered_params)
+        print()
+
+        logger.info("PROMPT=")
+        print(decode(input_ids[0], skip_special_tokens=False))
+        print()
+
+    t0 = time.time()
+    try:
+        if not is_chat and not shared.is_seq2seq:
+            yield ''
+
+        # Generate the entire reply at once.
+        if not state['stream']:
+            with torch.no_grad():
+                output = shared.model.generate(**generate_params)[0]
+                if cuda:
+                    output = output.cuda()
+
+            starting_from = 0 if shared.is_seq2seq else len(input_ids[0])
+            yield get_reply_from_output_ids(output, state, starting_from=starting_from)
+
+        # Stream the reply 1 token at a time.
+        # This is based on the trick of using 'stopping_criteria' to create an iterator.
+        else:
+
+            def generate_with_callback(callback=None, *args, **kwargs):
+                kwargs['stopping_criteria'].append(Stream(callback_func=callback))
+                clear_torch_cache()
+                with torch.no_grad():
+                    shared.model.generate(**kwargs)
+
+            def generate_with_streaming(**kwargs):
+                return Iteratorize(generate_with_callback, [], kwargs, callback=None)
+
+            with generate_with_streaming(**generate_params) as generator:
+                cumulative_reply = ''
+                starting_from = 0 if shared.is_seq2seq else len(input_ids[0])
+                for output in generator:
+                    if output[-1] in eos_token_ids:
+                        break
+
+                    new_content = get_reply_from_output_ids(output, state, starting_from=starting_from)
+                    # check the partial unicode character
+                    if chr(0xfffd) in new_content:
+                        continue
+
+                    cumulative_reply += new_content
+                    starting_from = len(output)
+                    yield cumulative_reply
+
+    except Exception:
+        traceback.print_exc()
+    finally:
+        t1 = time.time()
+        original_tokens = len(original_input_ids[0])
+        new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0)
+        print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
+        return
+
+
+def generate_reply_custom(question, original_question, seed, state, stopping_strings=None, is_chat=False):
+    """
+    For models that do not use the transformers library for sampling
+    """
+    seed = set_manual_seed(state['seed'])
+
+    t0 = time.time()
+    reply = ''
+    try:
+        if not is_chat:
+            yield ''
+
+        if not state['stream']:
+            reply = shared.model.generate(question, state)
+            yield reply
+        else:
+            for reply in shared.model.generate_with_streaming(question, state):
+                yield reply
+
+    except Exception:
+        traceback.print_exc()
+    finally:
+        t1 = time.time()
+        original_tokens = len(encode(original_question)[0])
+        new_tokens = len(encode(original_question + reply)[0]) - original_tokens
+        print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
+        return
diff --git a/modules/training.py b/modules/training.py
new file mode 100644
index 0000000000000000000000000000000000000000..0410ddd136c479a5cda09dc890ab21ecac93d674
--- /dev/null
+++ b/modules/training.py
@@ -0,0 +1,783 @@
+import os
+
+os.environ["WANDB_MODE"] = "offline"
+# os.environ["WANDB_DISABLED"] = "true"
+
+import json
+import math
+import random
+import shutil
+import sys
+import threading
+import time
+import traceback
+from datetime import datetime
+from pathlib import Path
+
+import gradio as gr
+import torch
+import transformers
+from datasets import Dataset, load_dataset
+from peft import (
+    LoraConfig,
+    get_peft_model,
+    prepare_model_for_kbit_training,
+    set_peft_model_state_dict
+)
+from peft.utils.other import \
+    TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as model_to_lora_modules
+from transformers import is_torch_xpu_available
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+)
+
+from modules import shared, ui, utils
+from modules.evaluate import (
+    calculate_perplexity,
+    generate_markdown_table,
+    save_past_evaluations
+)
+from modules.logging_colors import logger
+from modules.models import reload_model
+from modules.utils import natural_keys
+
+MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()}
+PARAMETERS = ["lora_name", "always_override", "q_proj_en", "v_proj_en", "k_proj_en", "o_proj_en", "gate_proj_en", "down_proj_en", "up_proj_en", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after", "stop_at_loss", "add_eos_token", "min_chars", "report_to"]
+WANT_INTERRUPT = False
+
+train_log = {}
+train_template = {}
+
+
+def create_ui():
+    mu = shared.args.multi_user
+    with gr.Tab("Training", elem_id="training-tab"):
+        with gr.Tab('Train LoRA', elem_id='lora-train-tab'):
+            tmp = gr.State('')
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("[Tutorial](https://github.com/oobabooga/text-generation-webui/wiki/05-%E2%80%90-Training-Tab)")
+
+                    with gr.Row():
+                        copy_from = gr.Dropdown(label='Copy parameters from', value='None', choices=utils.get_available_loras(), elem_classes=['slim-dropdown'], interactive=not mu)
+                        ui.create_refresh_button(copy_from, lambda: None, lambda: {'choices': utils.get_available_loras()}, 'refresh-button', interactive=not mu)
+
+                    with gr.Row():
+                        with gr.Column(scale=5):
+                            lora_name = gr.Textbox(label='Name', info='The name of your new LoRA file')
+                        with gr.Column():
+                            always_override = gr.Checkbox(label='Override Existing Files', value=False, info='If the name is the same, checking will replace the existing file, and unchecking will load and continue from it (the rank must be the same).', elem_classes=['no-background'])
+
+                    with gr.Accordion(label='Target Modules', open=False):
+                        gr.Markdown("Selects which modules to target in training. Targeting more modules is closer to a full fine-tune at the cost of increased VRAM requirements and adapter size.\nNOTE: Only works for model_id='llama', other types will retain default training behavior and not use these settings.")
+                        with gr.Row():
+                            with gr.Column():
+                                q_proj_en = gr.Checkbox(label='Enable q_proj', value=True)
+                            with gr.Column():
+                                v_proj_en = gr.Checkbox(label='Enable v_proj', value=True)
+                            with gr.Column():
+                                k_proj_en = gr.Checkbox(label='Enable k_proj', value=False)
+                            with gr.Column():
+                                o_proj_en = gr.Checkbox(label='Enable o_proj', value=False)
+                            with gr.Column():
+                                gate_proj_en = gr.Checkbox(label='Enable gate_proj', value=False)
+                            with gr.Column():
+                                down_proj_en = gr.Checkbox(label='Enable down_proj', value=False)
+                            with gr.Column():
+                                up_proj_en = gr.Checkbox(label='Enable up_proj', value=False)
+
+                    with gr.Row():
+                        with gr.Column():
+                            lora_rank = gr.Slider(label='LoRA Rank', value=32, minimum=0, maximum=1024, step=4, info='Also called dimension count. Higher values = larger file, more content control. Smaller values = smaller file, less control. Use 4 or 8 for style, 128 or 256 to teach, 1024+ for fine-detail on big data. More VRAM is needed for higher ranks.')
+                            lora_alpha = gr.Slider(label='LoRA Alpha', value=64, minimum=0, maximum=2048, step=4, info='This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.')
+                            batch_size = gr.Slider(label='Batch Size', value=128, minimum=0, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.')
+                            micro_batch_size = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='Per-device batch size (NOTE: multiple devices not yet implemented). Increasing this will increase VRAM usage.')
+                            cutoff_len = gr.Slider(label='Cutoff Length', minimum=0, maximum=4096, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. Higher values require drastically more VRAM.')
+
+                        with gr.Column():
+                            save_steps = gr.Number(label='Save every n steps', value=0, info='If above 0, a checkpoint of the LoRA will be saved every time this many steps pass.')
+
+                            epochs = gr.Number(label='Epochs', value=3, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
+                            learning_rate = gr.Textbox(label='Learning Rate', value='3e-4', info='In scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
+                            with gr.Row():
+                                lr_scheduler_type = gr.Dropdown(label='LR Scheduler', value='linear', choices=['linear', 'constant', 'constant_with_warmup', 'cosine', 'cosine_with_restarts', 'polynomial', 'inverse_sqrt'], info='Learning rate scheduler - defines how the learning rate changes over time. "Constant" means never change, "linear" means to go in a straight line from the learning rate down to 0, cosine follows a curve, etc.', elem_classes=['slim-dropdown'])
+
+                    with gr.Accordion(label='Advanced Options', open=False):
+                        with gr.Row():
+                            with gr.Column():
+                                lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers. This can help reduce overfitting. Most users should leave at default.')
+                                stop_at_loss = gr.Slider(label='Stop at loss', minimum=0.0, maximum=3.0, step=0.1, value=0.00, info='The process will automatically stop once the desired loss value is reached. (reasonable numbers are 1.5-1.8)')
+                                with gr.Row():
+                                    optimizer = gr.Dropdown(label='Optimizer', value='adamw_torch', choices=['adamw_hf', 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_bnb_8bit', 'adamw_anyprecision', 'sgd', 'adagrad'], info='Different optimizer implementation options, for advanced users. Effects of different options are not well documented yet.', elem_classes=['slim-dropdown'])
+
+                            with gr.Column():
+                                warmup_steps = gr.Number(label='Warmup Steps', value=100, info='For this many steps at the start, the learning rate will be lower than normal. This helps the trainer prepare the model and precompute statistics to improve the quality of training after the start.')
+                                train_only_after = gr.Textbox(label='Train Only After', value='', info='Only consider text *after* this string in any given chunk for training. For Alpaca datasets, use "### Response:" to only train the response and ignore the input.')
+
+                                add_eos_token = gr.Checkbox(label='Add EOS token', value=False, info="Adds EOS token for each dataset item. In case of raw text, the EOS will be added at the Hard Cut")
+
+                                higher_rank_limit = gr.Checkbox(label='Enable higher ranks', value=False, info='If checked, changes Rank/Alpha slider above to go much higher. This will not work without a datacenter-class GPU.')
+                                report_to = gr.Radio(label="Save detailed logs with", value="None", choices=["None", "wandb", "tensorboard"], interactive=True)
+
+                with gr.Column():
+                    with gr.Tab(label='Formatted Dataset'):
+                        with gr.Row():
+                            format = gr.Dropdown(choices=utils.get_datasets('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown'], interactive=not mu)
+                            ui.create_refresh_button(format, lambda: None, lambda: {'choices': utils.get_datasets('training/formats', 'json')}, 'refresh-button', interactive=not mu)
+
+                        with gr.Row():
+                            dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown'], interactive=not mu)
+                            ui.create_refresh_button(dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button', interactive=not mu)
+
+                        with gr.Row():
+                            eval_dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown'], interactive=not mu)
+                            ui.create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button', interactive=not mu)
+
+                        eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.')
+
+                    with gr.Tab(label="Raw text file"):
+                        with gr.Row():
+                            raw_text_file = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'txt'), value='None', label='Text file', info='The raw text file to use for training.', elem_classes=['slim-dropdown'], interactive=not mu)
+                            ui.create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'txt')}, 'refresh-button', interactive=not mu)
+
+                        with gr.Row():
+                            with gr.Column():
+                                overlap_len = gr.Slider(label='Overlap Length', minimum=0, maximum=512, value=128, step=16, info='How many tokens from the prior chunk of text to include into the next chunk. (The chunks themselves will be of a size determined by Cutoff Length). Setting overlap to exactly half the cutoff length may be ideal.')
+                                newline_favor_len = gr.Slider(label='Prefer Newline Cut Length', minimum=0, maximum=512, value=128, step=16, info='Length (in characters, not tokens) of the maximum distance to shift an overlap cut by to ensure chunks cut at newlines. If too low, cuts may occur in the middle of lines.')
+
+                            with gr.Column():
+                                hard_cut_string = gr.Textbox(label='Hard Cut String', value='\\n\\n\\n', info='String that indicates a hard cut between text parts. Helps prevent unwanted overlap.')
+                                min_chars = gr.Number(label='Ignore small blocks', value=0, info='Ignore Hard Cut blocks that have less or equal characters than this number')
+
+                    with gr.Row():
+                        start_button = gr.Button("Start LoRA Training", variant='primary', interactive=not mu)
+                        stop_button = gr.Button("Interrupt", interactive=not mu)
+
+                    output = gr.Markdown(value="Ready")
+
+        with gr.Tab('Perplexity evaluation', elem_id='evaluate-tab'):
+            with gr.Row():
+                with gr.Column():
+                    models = gr.Dropdown(utils.get_available_models(), label='Models', multiselect=True, interactive=not mu)
+                    evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + utils.get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The raw text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.', interactive=not mu)
+                    with gr.Row():
+                        with gr.Column():
+                            stride_length = gr.Slider(label='Stride', minimum=0, maximum=32768, value=512, step=256, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.')
+
+                        with gr.Column():
+                            max_length = gr.Slider(label='max_length', minimum=0, maximum=shared.settings['truncation_length_max'], value=0, step=256, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.')
+
+                    with gr.Row():
+                        start_current_evaluation = gr.Button("Evaluate loaded model", interactive=not mu)
+                        start_evaluation = gr.Button("Evaluate selected models", interactive=not mu)
+                        stop_evaluation = gr.Button("Interrupt", interactive=not mu)
+
+                with gr.Column():
+                    evaluation_log = gr.Markdown(value='')
+
+            evaluation_table = gr.Dataframe(value=generate_markdown_table(), interactive=True)
+            with gr.Row():
+                save_comments = gr.Button('Save comments', elem_classes="small-button", interactive=not mu)
+                refresh_table = gr.Button('Refresh the table', elem_classes="small-button", interactive=not mu)
+
+    # Training events
+    all_params = [lora_name, always_override, q_proj_en, v_proj_en, k_proj_en, o_proj_en, gate_proj_en, down_proj_en, up_proj_en, save_steps, micro_batch_size, batch_size, epochs, learning_rate, lr_scheduler_type, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format, eval_steps, raw_text_file, overlap_len, newline_favor_len, higher_rank_limit, warmup_steps, optimizer, hard_cut_string, train_only_after, stop_at_loss, add_eos_token, min_chars, report_to]
+
+    copy_from.change(do_copy_params, [copy_from] + all_params, all_params)
+    start_button.click(do_train, all_params, output)
+    stop_button.click(do_interrupt, None, None, queue=False)
+    higher_rank_limit.change(change_rank_limit, [higher_rank_limit], [lora_rank, lora_alpha])
+
+    # Evaluation events. For some reason, the interrupt event
+    # doesn't work with the .then() syntax, so I write them one
+    # by one in this ugly but functional way.
+    ev = start_evaluation.click(calculate_perplexity, [models, evaluate_text_file, stride_length, max_length], evaluation_log, show_progress=False)
+    start_evaluation.click(generate_markdown_table, None, evaluation_table, show_progress=False)
+
+    start_current_evaluation.click(lambda: ['current model'], None, tmp)
+    ev_cur = start_current_evaluation.click(calculate_perplexity, [tmp, evaluate_text_file, stride_length, max_length], evaluation_log, show_progress=False)
+    start_current_evaluation.click(generate_markdown_table, None, evaluation_table, show_progress=False)
+
+    stop_evaluation.click(None, None, None, cancels=[ev, ev_cur], queue=False)
+    refresh_table.click(generate_markdown_table, None, evaluation_table, show_progress=True)
+    save_comments.click(
+        save_past_evaluations, evaluation_table, None).then(
+        lambda: "Comments saved.", None, evaluation_log, show_progress=False)
+
+
+def do_interrupt():
+    global WANT_INTERRUPT
+    WANT_INTERRUPT = True
+
+
+def do_copy_params(lora_name: str, *args):
+    f_name = f"{shared.args.lora_dir}/{clean_path(None, lora_name)}/training_parameters.json"
+    if Path(f_name).is_file():
+        with open(f_name, 'r', encoding='utf-8') as format_file:
+            params: dict[str, str] = json.load(format_file)
+    else:
+        params = {}
+
+    result = list()
+    for i in range(0, len(PARAMETERS)):
+        key = PARAMETERS[i]
+        if key in params:
+            result.append(params[key])
+        else:
+            result.append(args[i])
+
+    return result
+
+
+def change_rank_limit(use_higher_ranks: bool):
+    mult = 2 if use_higher_ranks else 1
+    return {"maximum": 1024 * mult, "__type__": "update"}, {"maximum": 2048 * mult, "__type__": "update"}
+
+
+def clean_path(base_path: str, path: str):
+    """Strips unusual symbols and forcibly builds a path as relative to the intended directory."""
+    path = path.replace('\\', '/').replace('..', '_')
+    if base_path is None:
+        return path
+
+    return f'{Path(base_path).absolute()}/{path}'
+
+
+def backup_adapter(input_folder):
+    # Get the creation date of the file adapter_model.bin
+    try:
+        adapter_file = Path(f"{input_folder}/adapter_model.bin")
+        if adapter_file.is_file():
+
+            logger.info("Backing up existing LoRA adapter")
+            creation_date = datetime.fromtimestamp(adapter_file.stat().st_ctime)
+            creation_date_str = creation_date.strftime("Backup-%Y-%m-%d")
+
+            # Create the new subfolder
+            subfolder_path = Path(f"{input_folder}/{creation_date_str}")
+            subfolder_path.mkdir(parents=True, exist_ok=True)
+
+            # Check if the file already exists in the subfolder
+            backup_adapter_file = Path(f"{input_folder}/{creation_date_str}/adapter_model.bin")
+            if backup_adapter_file.is_file():
+                print(" - Backup already exists. Skipping backup process.")
+                return
+
+            # Copy existing files to the new subfolder
+            existing_files = Path(input_folder).iterdir()
+            for file in existing_files:
+                if file.is_file():
+                    shutil.copy2(file, subfolder_path)
+    except Exception as e:
+        print("An error occurred in backup_adapter:", str(e))
+
+
+def calc_trainable_parameters(model):
+    trainable_params = 0
+    all_param = 0
+    for _, param in model.named_parameters():
+        num_params = param.numel()
+        # if using DS Zero 3 and the weights are initialized empty
+        if num_params == 0 and hasattr(param, "ds_numel"):
+            num_params = param.ds_numel
+
+        all_param += num_params
+        if param.requires_grad:
+            trainable_params += num_params
+
+    return trainable_params, all_param
+
+
+def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str):
+
+    if shared.args.monkey_patch:
+        from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
+            replace_peft_model_with_int4_lora_model
+        )
+        replace_peft_model_with_int4_lora_model()
+
+    global WANT_INTERRUPT
+    WANT_INTERRUPT = False
+
+    # == Input validation / processing ==
+    yield "Preparing the input..."
+    lora_file_path = clean_path(None, lora_name)
+    if lora_file_path.strip() == '':
+        yield "Missing or invalid LoRA file name input."
+        return
+
+    lora_file_path = f"{Path(shared.args.lora_dir)}/{lora_file_path}"
+    actual_lr = float(learning_rate)
+    model_type = type(shared.model).__name__
+
+    if model_type in MODEL_CLASSES:
+        model_id = MODEL_CLASSES[model_type]
+    else:
+        model_id = "llama"
+        if model_type == "PeftModelForCausalLM":
+            if len(shared.lora_names) > 0:
+                yield "You are trying to train a LoRA while you already have another LoRA loaded. This will work, but may have unexpected effects. *(Will continue anyway in 5 seconds, press `Interrupt` to stop.)*"
+                logger.warning("Training LoRA over top of another LoRA. May have unexpected effects.")
+            else:
+                yield "Model ID not matched due to LoRA loading. Consider reloading base model. *(Will continue anyway in 5 seconds, press `Interrupt` to stop.)*"
+                logger.warning("Model ID not matched due to LoRA loading. Consider reloading base model.")
+        else:
+            yield "LoRA training has only currently been validated for LLaMA, OPT, GPT-J, and GPT-NeoX models. Unexpected errors may follow. *(Will continue anyway in 5 seconds, press `Interrupt` to stop.)*"
+            logger.warning(f"LoRA training has only currently been validated for LLaMA, OPT, GPT-J, and GPT-NeoX models. (Found model type: {model_type})")
+
+        time.sleep(5)
+
+    if shared.args.loader == 'GPTQ-for-LLaMa' and not shared.args.monkey_patch:
+        yield "LoRA training with GPTQ-for-LLaMa requires loading with `--monkey-patch`"
+        return
+
+    if cutoff_len <= 0 or micro_batch_size <= 0 or batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0:
+        yield "Cannot input zeroes."
+        return
+
+    gradient_accumulation_steps = batch_size // micro_batch_size
+    shared.tokenizer.pad_token_id = 0
+    shared.tokenizer.padding_side = "left"
+
+    # Populate target_modules list with chosen X_proj modules. Llama-based models only atm, non-llama will revert to default behavior.
+    def list_target_modules(model_id):
+        if model_id != "llama" and model_id != "mistral":
+            return model_to_lora_modules[model_id]
+
+        available_modules = {
+            "gate": gate_proj_en,
+            "down": down_proj_en,
+            "up": up_proj_en,
+            "q": q_proj_en,
+            "v": v_proj_en,
+            "k": k_proj_en,
+            "o": o_proj_en,
+        }
+        target_mods = [f"{name}_proj" for name, enabled in available_modules.items() if enabled]
+        return target_mods
+
+    def encode(text, add_bos_token):
+        result = shared.tokenizer.encode(text, truncation=True, max_length=cutoff_len)
+        # Check if the first two tokens are BOS
+        if len(result) >= 2 and result[:2] == [shared.tokenizer.bos_token_id, shared.tokenizer.bos_token_id]:
+            result = result[1:]
+
+        if not add_bos_token and result[0] == shared.tokenizer.bos_token_id:
+            result = result[1:]
+        return result
+
+    def tokenize(prompt, append_eos_token=False):
+
+        if train_only_after == '' or train_only_after not in prompt:
+            input_ids = encode(prompt, True)
+
+            if append_eos_token and input_ids[-1] != shared.tokenizer.eos_token_id and len(input_ids) < cutoff_len:
+                input_ids.append(shared.tokenizer.eos_token_id)
+
+            input_ids = [shared.tokenizer.pad_token_id] * (cutoff_len - len(input_ids)) + input_ids
+            labels = [1] * len(input_ids)
+
+        else:
+            ind = prompt.index(train_only_after) + len(train_only_after)
+            before_tokens = encode(prompt[:ind], True)
+            after_tokens = encode(prompt[ind:], False)
+
+            if append_eos_token and after_tokens[-1] != shared.tokenizer.eos_token_id:
+                after_tokens.append(shared.tokenizer.eos_token_id)
+
+            full_length = len(after_tokens) + len(before_tokens)
+            if full_length > cutoff_len:
+                after_tokens = after_tokens[:cutoff_len - len(before_tokens)]
+            else:
+                before_tokens = [shared.tokenizer.pad_token_id] * (cutoff_len - full_length) + before_tokens
+
+            input_ids = before_tokens + after_tokens
+            labels = [-100] * len(before_tokens) + [1] * len(after_tokens)
+
+        input_ids = torch.tensor(input_ids)
+        return {
+            "input_ids": input_ids,
+            "labels": labels,
+            "attention_mask": input_ids.ne(shared.tokenizer.pad_token_id),
+        }
+
+    train_template.clear()
+
+    # == Prep the dataset, format, etc ==
+    if raw_text_file not in ['None', '']:
+        train_template["template_type"] = "raw_text"
+        logger.info("Loading raw text file dataset")
+        fullpath = clean_path('training/datasets', f'{raw_text_file}')
+        fullpath = Path(fullpath)
+        if fullpath.is_dir():
+            logger.info('Training path directory {}'.format(raw_text_file))
+            raw_text = ""
+            file_paths = sorted(fullpath.glob('*.txt'), key=lambda path: natural_keys(path.name))
+            for file_path in file_paths:
+                if file_path.is_file():
+                    with file_path.open('r', encoding='utf-8') as file:
+                        raw_text += file.read().replace('\r', '')
+
+                    logger.info(f"Loaded training file: {file_path.name}")
+        else:
+            with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file:
+                raw_text = file.read().replace('\r', '')
+
+        cut_string = hard_cut_string.replace('\\n', '\n')
+        eos_added = 0
+        out_tokens = []
+        for text_part in raw_text.split(cut_string):
+            if len(text_part.strip()) <= min_chars:
+                continue
+
+            tokens = shared.tokenizer.encode(text_part)
+            if add_eos_token:
+                tokens.append(shared.tokenizer.eos_token_id)
+                eos_added += 1
+
+            step = cutoff_len - overlap_len
+            if step <= 0:
+                yield f"Error: overlap_len ({overlap_len}) cannot be greater than or equal to cutoff_len ({cutoff_len})"
+                return
+
+            out_tokens.extend(split_chunks(tokens, cutoff_len, step))
+
+        if eos_added > 0:
+            print(f"EOS added to {eos_added} text blocks")
+
+        del raw_text  # Note: could be a gig for a large dataset, so delete redundant data as we go to be safe on RAM
+        text_chunks = [shared.tokenizer.decode(x) for x in out_tokens]
+        del out_tokens
+        if newline_favor_len > 0:
+            text_chunks = [cut_chunk_for_newline(x, newline_favor_len) for x in text_chunks]
+
+        train_data = Dataset.from_list([tokenize(x) for x in text_chunks])
+        del text_chunks
+        eval_data = None
+    else:
+        if dataset in ['None', '']:
+            yield "Missing dataset choice input, cannot continue."
+            return
+
+        if format in ['None', '']:
+            yield "Missing format choice input, cannot continue."
+            return
+
+        train_template["template_type"] = "dataset"
+
+        with open(clean_path('training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile:
+            format_data: dict[str, str] = json.load(formatFile)
+
+        # == store training prompt ==
+        for _, value in format_data.items():
+            prompt_key = f"template_{len(train_template)}"
+            train_template[prompt_key] = value
+
+        def generate_prompt(data_point: dict[str, str]):
+            for options, data in format_data.items():
+                if set(options.split(',')) == set(x[0] for x in data_point.items() if (type(x[1]) is str and len(x[1].strip()) > 0)):
+                    for key, val in data_point.items():
+                        if type(val) is str:
+                            data = data.replace(f'%{key}%', val)
+                    return data
+            raise RuntimeError(f'Data-point "{data_point}" has no keyset match within format "{list(format_data.keys())}"')
+
+        def generate_and_tokenize_prompt(data_point):
+            prompt = generate_prompt(data_point)
+            return tokenize(prompt, add_eos_token)
+
+        logger.info("Loading JSON datasets")
+        data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json'))
+        train_data = data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30))
+
+        if eval_dataset == 'None':
+            eval_data = None
+        else:
+            eval_data = load_dataset("json", data_files=clean_path('training/datasets', f'{eval_dataset}.json'))
+            eval_data = eval_data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30))
+
+    # == We MUST reload model if it went through any previous training, even failed one ==
+    if shared.model_dirty_from_training:
+        selected_model = shared.model_name
+        if selected_model:
+            print("\033[1;31;1m(Model has been modified by previous training, it needs to be reloaded...)\033[0;37;0m")
+            try:
+                yield f"Reloading {selected_model}..."
+                reload_model()
+                if shared.model is not None:
+                    print("Model reloaded OK, continue with training.")
+                else:
+                    return f"Failed to load {selected_model}."
+            except:
+                exc = traceback.format_exc()
+                logger.error('Failed to reload the model.')
+                print(exc)
+                return exc.replace('\n', '\n\n')
+
+    # == Start prepping the model itself ==
+    if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'):
+        logger.info("Getting model ready")
+        if 'quantization_config' in shared.model.config.to_dict():
+            prepare_model_for_kbit_training(shared.model)
+
+    # base model is now frozen and should not be reused for any other LoRA training than this one
+    shared.model_dirty_from_training = True
+
+    logger.info("Preparing for training")
+    config = LoraConfig(
+        r=lora_rank,
+        lora_alpha=lora_alpha,
+        target_modules=list_target_modules(model_id),
+        lora_dropout=lora_dropout,
+        bias="none",
+        task_type="CAUSAL_LM"
+    )
+
+    # == Backup the existing adapter ==
+    if not always_override:
+        backup_adapter(lora_file_path)
+
+    # == get model trainable params
+    model_trainable_params, model_all_params = calc_trainable_parameters(shared.model)
+
+    try:
+        logger.info("Creating LoRA model")
+        lora_model = get_peft_model(shared.model, config)
+        if not always_override and Path(f"{lora_file_path}/adapter_model.bin").is_file():
+            logger.info("Loading existing LoRA data")
+            state_dict_peft = torch.load(f"{lora_file_path}/adapter_model.bin", weights_only=True)
+            set_peft_model_state_dict(lora_model, state_dict_peft)
+    except:
+        yield traceback.format_exc().replace('\n', '\n\n')
+        return
+
+    if shared.args.monkey_patch:
+        from alpaca_lora_4bit.autograd_4bit import Autograd4bitQuantLinear
+        from alpaca_lora_4bit.models import Linear4bitLt
+        for _, m in lora_model.named_modules():
+            if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt):
+                if m.is_v1_model:
+                    m.zeros = m.zeros.half()
+                m.scales = m.scales.half()
+
+    class Tracked():
+        def __init__(self):
+            self.current_steps = 0
+            self.max_steps = 0
+            self.did_save = False
+
+    tracked = Tracked()
+    actual_save_steps = math.ceil(save_steps / gradient_accumulation_steps)
+
+    class Callbacks(transformers.TrainerCallback):
+        def on_step_begin(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs):
+            tracked.current_steps = state.global_step * gradient_accumulation_steps
+            tracked.max_steps = state.max_steps * gradient_accumulation_steps
+            if WANT_INTERRUPT:
+                control.should_epoch_stop = True
+                control.should_training_stop = True
+            elif state.global_step > 0 and actual_save_steps > 0 and state.global_step % actual_save_steps == 0:
+                lora_model.save_pretrained(f"{lora_file_path}/checkpoint-{tracked.current_steps}/")
+                # Save log
+                with open(f"{lora_file_path}/checkpoint-{tracked.current_steps}/training_log.json", 'w', encoding='utf-8') as file:
+                    json.dump(train_log, file, indent=2)
+                # == Save training prompt ==
+                with open(f"{lora_file_path}/checkpoint-{tracked.current_steps}/training_prompt.json", 'w', encoding='utf-8') as file:
+                    json.dump(train_template, file, indent=2)
+
+        def on_substep_end(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs):
+            tracked.current_steps += 1
+            if WANT_INTERRUPT:
+                control.should_epoch_stop = True
+                control.should_training_stop = True
+
+        def on_log(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, logs, **kwargs):
+            train_log.update(logs)
+            train_log.update({"current_steps": tracked.current_steps})
+            if WANT_INTERRUPT:
+                print("\033[1;31;1mInterrupted by user\033[0;37;0m")
+
+            print(f"\033[1;30;40mStep: {tracked.current_steps} \033[0;37;0m", end='')
+            if 'loss' in logs:
+                loss = float(logs['loss'])
+                if loss <= stop_at_loss:
+                    control.should_epoch_stop = True
+                    control.should_training_stop = True
+                    print(f"\033[1;31;1mStop Loss {stop_at_loss} reached.\033[0;37;0m")
+
+    # Fix training for mixed precision models
+    for param in shared.model.parameters():
+        if param.requires_grad:
+            param.data = param.data.float()
+
+    trainer = transformers.Trainer(
+        model=lora_model,
+        train_dataset=train_data,
+        eval_dataset=eval_data,
+        args=transformers.TrainingArguments(
+            report_to=report_to if report_to != "None" else None,
+            per_device_train_batch_size=micro_batch_size,
+            gradient_accumulation_steps=gradient_accumulation_steps,
+            warmup_steps=math.ceil(warmup_steps / gradient_accumulation_steps),
+            num_train_epochs=epochs,
+            learning_rate=actual_lr,
+            fp16=False if shared.args.cpu or shared.args.bf16 else True,
+            bf16=shared.args.bf16,
+            optim=optimizer,
+            logging_steps=2 if stop_at_loss > 0 else 5,
+            evaluation_strategy="steps" if eval_data is not None else "no",
+            eval_steps=math.ceil(eval_steps / gradient_accumulation_steps) if eval_data is not None else None,
+            save_strategy="steps" if eval_data is not None else "no",
+            output_dir=lora_file_path,
+            lr_scheduler_type=lr_scheduler_type,
+            load_best_model_at_end=eval_data is not None,
+            # TODO: Enable multi-device support
+            ddp_find_unused_parameters=None,
+            no_cuda=shared.args.cpu,
+            use_ipex=True if is_torch_xpu_available() and not shared.args.cpu else False
+        ),
+        data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
+        callbacks=list([Callbacks()])
+    )
+
+    lora_model.config.use_cache = False
+
+    if torch.__version__ >= "2" and sys.platform != "win32":
+        lora_model = torch.compile(lora_model)
+
+    # == Save parameters for reuse ==
+    with open(f"{lora_file_path}/training_parameters.json", 'w', encoding='utf-8') as file:
+        vars = locals()
+        json.dump({x: vars[x] for x in PARAMETERS}, file, indent=2)
+
+    # == Save training prompt ==
+    with open(f"{lora_file_path}/training_prompt.json", 'w', encoding='utf-8') as file:
+        json.dump(train_template, file, indent=2)
+
+    # == Main run and monitor loop ==
+    logger.info("Starting training")
+    yield "Starting..."
+
+    lora_trainable_param, lora_all_param = calc_trainable_parameters(lora_model)
+
+    projections_string = ", ".join([projection.replace("_proj", "") for projection in list_target_modules(model_id)])
+
+    print(f"Training '{model_id}' model using ({projections_string}) projections")
+
+    if lora_all_param > 0:
+        print(f"Trainable params: {lora_trainable_param:,d} ({100 * lora_trainable_param / lora_all_param:.4f} %), All params: {lora_all_param:,d} (Model: {model_all_params:,d})")
+
+    train_log.update({"base_model_name": shared.model_name})
+    train_log.update({"base_model_class": shared.model.__class__.__name__})
+    train_log.update({"base_loaded_in_4bit": getattr(lora_model, "is_loaded_in_4bit", False)})
+    train_log.update({"base_loaded_in_8bit": getattr(lora_model, "is_loaded_in_8bit", False)})
+    train_log.update({"projections": projections_string})
+
+    if stop_at_loss > 0:
+        print(f"Monitoring loss \033[1;31;1m(Auto-Stop at: {stop_at_loss})\033[0;37;0m")
+
+    if WANT_INTERRUPT:
+        yield "Interrupted before start."
+        return
+
+    def log_train_dataset(trainer):
+        decoded_entries = []
+        # Try to decode the entries and write the log file
+        try:
+            # Iterate over the first 10 elements in the dataset (or fewer if there are less than 10)
+            for i in range(min(10, len(trainer.train_dataset))):
+                decoded_text = shared.tokenizer.decode(trainer.train_dataset[i]['input_ids'])
+                decoded_entries.append({"value": decoded_text})
+
+            # Write the log file
+            Path('logs').mkdir(exist_ok=True)
+            with open(Path('logs/train_dataset_sample.json'), 'w') as json_file:
+                json.dump(decoded_entries, json_file, indent=4)
+
+            logger.info("Log file 'train_dataset_sample.json' created in the 'logs' directory.")
+        except Exception as e:
+            logger.error(f"Failed to create log file due to error: {e}")
+
+    def threaded_run():
+        log_train_dataset(trainer)
+        trainer.train()
+        # Note: save in the thread in case the gradio thread breaks (eg browser closed)
+        lora_model.save_pretrained(lora_file_path)
+        logger.info("LoRA training run is completed and saved.")
+        # Save log
+        with open(f"{lora_file_path}/training_log.json", 'w', encoding='utf-8') as file:
+            json.dump(train_log, file, indent=2)
+
+    thread = threading.Thread(target=threaded_run)
+    thread.start()
+    last_step = 0
+    start_time = time.perf_counter()
+
+    while thread.is_alive():
+        time.sleep(0.5)
+        if WANT_INTERRUPT:
+            yield "Interrupting, please wait... *(Run will stop after the current training step completes.)*"
+
+        elif tracked.current_steps != last_step:
+            last_step = tracked.current_steps
+            time_elapsed = time.perf_counter() - start_time
+            if time_elapsed <= 0:
+                timer_info = ""
+                total_time_estimate = 999
+            else:
+                its = tracked.current_steps / time_elapsed
+                if its > 1:
+                    timer_info = f"`{its:.2f}` it/s"
+                else:
+                    timer_info = f"`{1.0/its:.2f}` s/it"
+
+                total_time_estimate = (1.0 / its) * (tracked.max_steps)
+
+            yield f"Running... **{tracked.current_steps}** / **{tracked.max_steps}** ... {timer_info}, {format_time(time_elapsed)} / {format_time(total_time_estimate)} ... {format_time(total_time_estimate - time_elapsed)} remaining"
+
+    # Saving in the train thread might fail if an error occurs, so save here if so.
+    if not tracked.did_save:
+        logger.info("Training complete, saving")
+        lora_model.save_pretrained(lora_file_path)
+
+    if WANT_INTERRUPT:
+        logger.info("Training interrupted.")
+        yield f"Interrupted. Incomplete LoRA saved to `{lora_file_path}`."
+    else:
+        logger.info("Training complete!")
+        yield f"Done! LoRA saved to `{lora_file_path}`.\n\nBefore testing your new LoRA, make sure to first reload the model, as it is currently dirty from training."
+
+
+def split_chunks(arr, size, step):
+    for i in range(0, len(arr), step):
+        yield arr[i:i + size]
+
+
+def cut_chunk_for_newline(chunk: str, max_length: int):
+    if '\n' not in chunk:
+        return chunk
+
+    first_newline = chunk.index('\n')
+    if first_newline < max_length:
+        chunk = chunk[first_newline + 1:]
+
+    if '\n' not in chunk:
+        return chunk
+
+    last_newline = chunk.rindex('\n')
+    if len(chunk) - last_newline < max_length:
+        chunk = chunk[:last_newline]
+
+    return chunk
+
+
+def format_time(seconds: float):
+    if seconds < 120:
+        return f"`{seconds:.0f}` seconds"
+
+    minutes = seconds / 60
+    if minutes < 120:
+        return f"`{minutes:.0f}` minutes"
+
+    hours = minutes / 60
+    return f"`{hours:.0f}` hours"
diff --git a/modules/ui.py b/modules/ui.py
new file mode 100644
index 0000000000000000000000000000000000000000..06498f69bfd28976ec2cf050296f2fe4ba8eb710
--- /dev/null
+++ b/modules/ui.py
@@ -0,0 +1,268 @@
+import copy
+from pathlib import Path
+
+import gradio as gr
+import torch
+import yaml
+from transformers import is_torch_xpu_available
+
+import extensions
+from modules import shared
+
+with open(Path(__file__).resolve().parent / '../css/NotoSans/stylesheet.css', 'r') as f:
+    css = f.read()
+with open(Path(__file__).resolve().parent / '../css/main.css', 'r') as f:
+    css += f.read()
+with open(Path(__file__).resolve().parent / '../js/main.js', 'r') as f:
+    js = f.read()
+with open(Path(__file__).resolve().parent / '../js/save_files.js', 'r') as f:
+    save_files_js = f.read()
+with open(Path(__file__).resolve().parent / '../js/switch_tabs.js', 'r') as f:
+    switch_tabs_js = f.read()
+with open(Path(__file__).resolve().parent / '../js/show_controls.js', 'r') as f:
+    show_controls_js = f.read()
+with open(Path(__file__).resolve().parent / '../js/update_big_picture.js', 'r') as f:
+    update_big_picture_js = f.read()
+
+refresh_symbol = '🔄'
+delete_symbol = '🗑️'
+save_symbol = '💾'
+
+theme = gr.themes.Default(
+    font=['Noto Sans', 'Helvetica', 'ui-sans-serif', 'system-ui', 'sans-serif'],
+    font_mono=['IBM Plex Mono', 'ui-monospace', 'Consolas', 'monospace'],
+).set(
+    border_color_primary='#c5c5d2',
+    button_large_padding='6px 12px',
+    body_text_color_subdued='#484848',
+    background_fill_secondary='#eaeaea'
+)
+
+if Path("notification.mp3").exists():
+    audio_notification_js = "document.querySelector('#audio_notification audio')?.play();"
+else:
+    audio_notification_js = ""
+
+
+def list_model_elements():
+    elements = [
+        'loader',
+        'filter_by_loader',
+        'cpu_memory',
+        'auto_devices',
+        'disk',
+        'cpu',
+        'bf16',
+        'load_in_8bit',
+        'trust_remote_code',
+        'no_use_fast',
+        'use_flash_attention_2',
+        'load_in_4bit',
+        'compute_dtype',
+        'quant_type',
+        'use_double_quant',
+        'wbits',
+        'groupsize',
+        'model_type',
+        'pre_layer',
+        'triton',
+        'desc_act',
+        'no_inject_fused_attention',
+        'no_inject_fused_mlp',
+        'no_use_cuda_fp16',
+        'disable_exllama',
+        'disable_exllamav2',
+        'cfg_cache',
+        'no_flash_attn',
+        'num_experts_per_token',
+        'cache_8bit',
+        'threads',
+        'threads_batch',
+        'n_batch',
+        'no_mmap',
+        'mlock',
+        'no_mul_mat_q',
+        'n_gpu_layers',
+        'tensor_split',
+        'n_ctx',
+        'gpu_split',
+        'max_seq_len',
+        'compress_pos_emb',
+        'alpha_value',
+        'rope_freq_base',
+        'numa',
+        'logits_all',
+        'no_offload_kqv',
+        'row_split',
+        'tensorcores',
+        'hqq_backend',
+    ]
+    if is_torch_xpu_available():
+        for i in range(torch.xpu.device_count()):
+            elements.append(f'gpu_memory_{i}')
+    else:
+        for i in range(torch.cuda.device_count()):
+            elements.append(f'gpu_memory_{i}')
+
+    return elements
+
+
+def list_interface_input_elements():
+    elements = [
+        'max_new_tokens',
+        'auto_max_new_tokens',
+        'max_tokens_second',
+        'max_updates_second',
+        'prompt_lookup_num_tokens',
+        'seed',
+        'temperature',
+        'temperature_last',
+        'dynamic_temperature',
+        'dynatemp_low',
+        'dynatemp_high',
+        'dynatemp_exponent',
+        'smoothing_factor',
+        'top_p',
+        'min_p',
+        'top_k',
+        'typical_p',
+        'epsilon_cutoff',
+        'eta_cutoff',
+        'repetition_penalty',
+        'presence_penalty',
+        'frequency_penalty',
+        'repetition_penalty_range',
+        'encoder_repetition_penalty',
+        'no_repeat_ngram_size',
+        'min_length',
+        'do_sample',
+        'penalty_alpha',
+        'num_beams',
+        'length_penalty',
+        'early_stopping',
+        'mirostat_mode',
+        'mirostat_tau',
+        'mirostat_eta',
+        'grammar_string',
+        'negative_prompt',
+        'guidance_scale',
+        'add_bos_token',
+        'ban_eos_token',
+        'custom_token_bans',
+        'sampler_priority',
+        'truncation_length',
+        'custom_stopping_strings',
+        'skip_special_tokens',
+        'stream',
+        'tfs',
+        'top_a',
+    ]
+
+    # Chat elements
+    elements += [
+        'textbox',
+        'start_with',
+        'character_menu',
+        'history',
+        'name1',
+        'name2',
+        'greeting',
+        'context',
+        'mode',
+        'custom_system_message',
+        'instruction_template_str',
+        'chat_template_str',
+        'chat_style',
+        'chat-instruct_command',
+    ]
+
+    # Notebook/default elements
+    elements += [
+        'textbox-notebook',
+        'textbox-default',
+        'output_textbox',
+        'prompt_menu-default',
+        'prompt_menu-notebook',
+    ]
+
+    # Model elements
+    elements += list_model_elements()
+
+    return elements
+
+
+def gather_interface_values(*args):
+    output = {}
+    for i, element in enumerate(list_interface_input_elements()):
+        output[element] = args[i]
+
+    if not shared.args.multi_user:
+        shared.persistent_interface_state = output
+
+    return output
+
+
+def apply_interface_values(state, use_persistent=False):
+    if use_persistent:
+        state = shared.persistent_interface_state
+
+    elements = list_interface_input_elements()
+    if len(state) == 0:
+        return [gr.update() for k in elements]  # Dummy, do nothing
+    else:
+        return [state[k] if k in state else gr.update() for k in elements]
+
+
+def save_settings(state, preset, extensions_list, show_controls, theme_state):
+    output = copy.deepcopy(shared.settings)
+    exclude = ['name2', 'greeting', 'context', 'turn_template']
+    for k in state:
+        if k in shared.settings and k not in exclude:
+            output[k] = state[k]
+
+    output['preset'] = preset
+    output['prompt-default'] = state['prompt_menu-default']
+    output['prompt-notebook'] = state['prompt_menu-notebook']
+    output['character'] = state['character_menu']
+    output['default_extensions'] = extensions_list
+    output['seed'] = int(output['seed'])
+    output['show_controls'] = show_controls
+    output['dark_theme'] = True if theme_state == 'dark' else False
+
+    # Save extension values in the UI
+    for extension_name in extensions_list:
+        extension = getattr(extensions, extension_name).script
+        if hasattr(extension, 'params'):
+            params = getattr(extension, 'params')
+            for param in params:
+                _id = f"{extension_name}-{param}"
+                # Only save if different from default value
+                if param not in shared.default_settings or params[param] != shared.default_settings[param]:
+                    output[_id] = params[param]
+
+    # Do not save unchanged settings
+    for key in list(output.keys()):
+        if key in shared.default_settings and output[key] == shared.default_settings[key]:
+            output.pop(key)
+
+    return yaml.dump(output, sort_keys=False, width=float("inf"))
+
+
+def create_refresh_button(refresh_component, refresh_method, refreshed_args, elem_class, interactive=True):
+    """
+    Copied from https://github.com/AUTOMATIC1111/stable-diffusion-webui
+    """
+    def refresh():
+        refresh_method()
+        args = refreshed_args() if callable(refreshed_args) else refreshed_args
+
+        return gr.update(**(args or {}))
+
+    refresh_button = gr.Button(refresh_symbol, elem_classes=elem_class, interactive=interactive)
+    refresh_button.click(
+        fn=lambda: {k: tuple(v) if type(k) is list else v for k, v in refresh().items()},
+        inputs=[],
+        outputs=[refresh_component]
+    )
+
+    return refresh_button
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1b1af97e982905042ea7b4058b7ac8190924866
--- /dev/null
+++ b/modules/ui_chat.py
@@ -0,0 +1,371 @@
+import json
+from functools import partial
+from pathlib import Path
+
+import gradio as gr
+from PIL import Image
+
+from modules import chat, shared, ui, utils
+from modules.html_generator import chat_html_wrapper
+from modules.text_generation import stop_everything_event
+from modules.utils import gradio
+
+inputs = ('Chat input', 'interface_state')
+reload_arr = ('history', 'name1', 'name2', 'mode', 'chat_style', 'character_menu')
+clear_arr = ('delete_chat-confirm', 'delete_chat', 'delete_chat-cancel')
+
+
+def create_ui():
+    mu = shared.args.multi_user
+
+    shared.gradio['Chat input'] = gr.State()
+    shared.gradio['history'] = gr.State({'internal': [], 'visible': []})
+
+    with gr.Tab('Chat', elem_id='chat-tab', elem_classes=("old-ui" if shared.args.chat_buttons else None)):
+        with gr.Row():
+            with gr.Column(elem_id='chat-col'):
+                shared.gradio['display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', ''))
+
+                with gr.Row(elem_id="chat-input-row"):
+                    with gr.Column(scale=1, elem_id='gr-hover-container'):
+                        gr.HTML(value='<div class="hover-element" onclick="void(0)"><span style="width: 100px; display: block" id="hover-element-button">&#9776;</span><div class="hover-menu" id="hover-menu"></div>', elem_id='gr-hover')
+
+                    with gr.Column(scale=10, elem_id='chat-input-container'):
+                        shared.gradio['textbox'] = gr.Textbox(label='', placeholder='Send a message', elem_id='chat-input', elem_classes=['add_scrollbar'])
+                        shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls')
+                        shared.gradio['typing-dots'] = gr.HTML(value='<div class="typing"><span></span><span class="dot1"></span><span class="dot2"></span></div>', label='typing', elem_id='typing-container')
+
+                    with gr.Column(scale=1, elem_id='generate-stop-container'):
+                        with gr.Row():
+                            shared.gradio['Stop'] = gr.Button('Stop', elem_id='stop', visible=False)
+                            shared.gradio['Generate'] = gr.Button('Generate', elem_id='Generate', variant='primary')
+
+        # Hover menu buttons
+        with gr.Column(elem_id='chat-buttons'):
+            with gr.Row():
+                shared.gradio['Regenerate'] = gr.Button('Regenerate (Ctrl + Enter)', elem_id='Regenerate')
+                shared.gradio['Continue'] = gr.Button('Continue (Alt + Enter)', elem_id='Continue')
+                shared.gradio['Remove last'] = gr.Button('Remove last reply (Ctrl + Shift + Backspace)', elem_id='Remove-last')
+
+            with gr.Row():
+                shared.gradio['Replace last reply'] = gr.Button('Replace last reply (Ctrl + Shift + L)', elem_id='Replace-last')
+                shared.gradio['Copy last reply'] = gr.Button('Copy last reply (Ctrl + Shift + K)', elem_id='Copy-last')
+                shared.gradio['Impersonate'] = gr.Button('Impersonate (Ctrl + Shift + M)', elem_id='Impersonate')
+
+            with gr.Row():
+                shared.gradio['Send dummy message'] = gr.Button('Send dummy message')
+                shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply')
+
+            with gr.Row():
+                shared.gradio['send-chat-to-default'] = gr.Button('Send to default')
+                shared.gradio['send-chat-to-notebook'] = gr.Button('Send to notebook')
+
+        with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']):
+            with gr.Column():
+                with gr.Row():
+                    shared.gradio['unique_id'] = gr.Dropdown(label='Past chats', elem_classes=['slim-dropdown'], interactive=not mu)
+
+                with gr.Row():
+                    shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu)
+                    shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
+                    shared.gradio['delete_chat-confirm'] = gr.Button('Confirm', variant='stop', visible=False, elem_classes='refresh-button')
+                    shared.gradio['delete_chat-cancel'] = gr.Button('Cancel', visible=False, elem_classes='refresh-button')
+                    shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes='refresh-button')
+
+                with gr.Row(elem_id='rename-row'):
+                    shared.gradio['rename_to'] = gr.Textbox(label='Rename to:', placeholder='New name', visible=False, elem_classes=['no-background'])
+                    shared.gradio['rename_to-confirm'] = gr.Button('Confirm', visible=False, elem_classes='refresh-button')
+                    shared.gradio['rename_to-cancel'] = gr.Button('Cancel', visible=False, elem_classes='refresh-button')
+
+        with gr.Row():
+            shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'])
+
+        with gr.Row():
+            shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value='chat', label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.', elem_id='chat-mode')
+            shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')
+
+
+def create_chat_settings_ui():
+    mu = shared.args.multi_user
+    with gr.Tab('Character'):
+        with gr.Row():
+            with gr.Column(scale=8):
+                with gr.Row():
+                    shared.gradio['character_menu'] = gr.Dropdown(value=None, choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown')
+                    ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button', interactive=not mu)
+                    shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu)
+                    shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
+
+                shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Your name')
+                shared.gradio['name2'] = gr.Textbox(value='', lines=1, label='Character\'s name')
+                shared.gradio['context'] = gr.Textbox(value='', lines=10, label='Context', elem_classes=['add_scrollbar'])
+                shared.gradio['greeting'] = gr.Textbox(value='', lines=5, label='Greeting', elem_classes=['add_scrollbar'])
+
+            with gr.Column(scale=1):
+                shared.gradio['character_picture'] = gr.Image(label='Character picture', type='pil', interactive=not mu)
+                shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('cache/pfp_me.png')) if Path('cache/pfp_me.png').exists() else None, interactive=not mu)
+
+    with gr.Tab('Instruction template'):
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    shared.gradio['instruction_template'] = gr.Dropdown(choices=utils.get_available_instruction_templates(), label='Saved instruction templates', value='Select template to load...', elem_classes='slim-dropdown')
+                    ui.create_refresh_button(shared.gradio['instruction_template'], lambda: None, lambda: {'choices': utils.get_available_instruction_templates()}, 'refresh-button', interactive=not mu)
+                    shared.gradio['load_template'] = gr.Button("Load", elem_classes='refresh-button')
+                    shared.gradio['save_template'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu)
+                    shared.gradio['delete_template'] = gr.Button('🗑️ ', elem_classes='refresh-button', interactive=not mu)
+
+            with gr.Column():
+                pass
+
+        with gr.Row():
+            with gr.Column():
+                shared.gradio['custom_system_message'] = gr.Textbox(value=shared.settings['custom_system_message'], lines=2, label='Custom system message', info='If not empty, will be used instead of the default one.', elem_classes=['add_scrollbar'])
+                shared.gradio['instruction_template_str'] = gr.Textbox(value='', label='Instruction template', lines=24, info='Change this according to the model/LoRA that you are using. Used in instruct and chat-instruct modes.', elem_classes=['add_scrollbar', 'monospace'])
+                with gr.Row():
+                    shared.gradio['send_instruction_to_default'] = gr.Button('Send to default', elem_classes=['small-button'])
+                    shared.gradio['send_instruction_to_notebook'] = gr.Button('Send to notebook', elem_classes=['small-button'])
+                    shared.gradio['send_instruction_to_negative_prompt'] = gr.Button('Send to negative prompt', elem_classes=['small-button'])
+
+            with gr.Column():
+                shared.gradio['chat_template_str'] = gr.Textbox(value=shared.settings['chat_template_str'], label='Chat template', lines=22, elem_classes=['add_scrollbar', 'monospace'])
+                shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=4, label='Command for chat-instruct mode', info='<|character|> gets replaced by the bot name, and <|prompt|> gets replaced by the regular chat prompt.', elem_classes=['add_scrollbar'])
+
+    with gr.Tab('Chat history'):
+        with gr.Row():
+            with gr.Column():
+                shared.gradio['save_chat_history'] = gr.Button(value='Save history')
+
+            with gr.Column():
+                shared.gradio['load_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt'], label='Upload History JSON')
+
+    with gr.Tab('Upload character'):
+        with gr.Tab('YAML or JSON'):
+            with gr.Row():
+                shared.gradio['upload_json'] = gr.File(type='binary', file_types=['.json', '.yaml'], label='JSON or YAML File', interactive=not mu)
+                shared.gradio['upload_img_bot'] = gr.Image(type='pil', label='Profile Picture (optional)', interactive=not mu)
+
+            shared.gradio['Submit character'] = gr.Button(value='Submit', interactive=False)
+
+        with gr.Tab('TavernAI PNG'):
+            with gr.Row():
+                with gr.Column():
+                    shared.gradio['upload_img_tavern'] = gr.Image(type='pil', label='TavernAI PNG File', elem_id='upload_img_tavern', interactive=not mu)
+                    shared.gradio['tavern_json'] = gr.State()
+                with gr.Column():
+                    shared.gradio['tavern_name'] = gr.Textbox(value='', lines=1, label='Name', interactive=False)
+                    shared.gradio['tavern_desc'] = gr.Textbox(value='', lines=4, max_lines=4, label='Description', interactive=False)
+
+            shared.gradio['Submit tavern character'] = gr.Button(value='Submit', interactive=False)
+
+
+def create_event_handlers():
+
+    # Obsolete variables, kept for compatibility with old extensions
+    shared.input_params = gradio(inputs)
+    shared.reload_inputs = gradio(reload_arr)
+
+    shared.gradio['Generate'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then(
+        chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
+        lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}')
+
+    shared.gradio['textbox'].submit(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then(
+        chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
+        lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}')
+
+    shared.gradio['Regenerate'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        partial(chat.generate_chat_reply_wrapper, regenerate=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
+        lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}')
+
+    shared.gradio['Continue'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        partial(chat.generate_chat_reply_wrapper, _continue=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
+        lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}')
+
+    shared.gradio['Impersonate'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda x: x, gradio('textbox'), gradio('Chat input'), show_progress=False).then(
+        chat.impersonate_wrapper, gradio(inputs), gradio('textbox', 'display'), show_progress=False).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}')
+
+    shared.gradio['Replace last reply'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        chat.replace_last_reply, gradio('textbox', 'interface_state'), gradio('history')).then(
+        lambda: '', None, gradio('textbox'), show_progress=False).then(
+        chat.redraw_html, gradio(reload_arr), gradio('display')).then(
+        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None)
+
+    shared.gradio['Send dummy message'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        chat.send_dummy_message, gradio('textbox', 'interface_state'), gradio('history')).then(
+        lambda: '', None, gradio('textbox'), show_progress=False).then(
+        chat.redraw_html, gradio(reload_arr), gradio('display')).then(
+        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None)
+
+    shared.gradio['Send dummy reply'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        chat.send_dummy_reply, gradio('textbox', 'interface_state'), gradio('history')).then(
+        lambda: '', None, gradio('textbox'), show_progress=False).then(
+        chat.redraw_html, gradio(reload_arr), gradio('display')).then(
+        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None)
+
+    shared.gradio['Remove last'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        chat.remove_last_message, gradio('history'), gradio('textbox', 'history'), show_progress=False).then(
+        chat.redraw_html, gradio(reload_arr), gradio('display')).then(
+        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None)
+
+    shared.gradio['Stop'].click(
+        stop_everything_event, None, None, queue=False).then(
+        chat.redraw_html, gradio(reload_arr), gradio('display'))
+
+    if not shared.args.multi_user:
+        shared.gradio['unique_id'].select(
+            chat.load_history, gradio('unique_id', 'character_menu', 'mode'), gradio('history')).then(
+            chat.redraw_html, gradio(reload_arr), gradio('display'))
+
+    shared.gradio['Start new chat'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        chat.start_new_chat, gradio('interface_state'), gradio('history')).then(
+        chat.redraw_html, gradio(reload_arr), gradio('display')).then(
+        lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id'))
+
+    shared.gradio['delete_chat'].click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, gradio(clear_arr))
+    shared.gradio['delete_chat-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr))
+    shared.gradio['delete_chat-confirm'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda x, y: str(chat.find_all_histories(x).index(y)), gradio('interface_state', 'unique_id'), gradio('temporary_text')).then(
+        chat.delete_history, gradio('unique_id', 'character_menu', 'mode'), None).then(
+        chat.load_history_after_deletion, gradio('interface_state', 'temporary_text'), gradio('history', 'unique_id')).then(
+        chat.redraw_html, gradio(reload_arr), gradio('display')).then(
+        lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr))
+
+    shared.gradio['rename_chat'].click(
+        lambda x: x, gradio('unique_id'), gradio('rename_to')).then(
+        lambda: [gr.update(visible=True)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False)
+
+    shared.gradio['rename_to-cancel'].click(
+        lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False)
+
+    shared.gradio['rename_to-confirm'].click(
+        chat.rename_history, gradio('unique_id', 'rename_to', 'character_menu', 'mode'), None).then(
+        lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False).then(
+        lambda x, y: gr.update(choices=chat.find_all_histories(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id'))
+
+    shared.gradio['rename_to'].submit(
+        chat.rename_history, gradio('unique_id', 'rename_to', 'character_menu', 'mode'), None).then(
+        lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False).then(
+        lambda x, y: gr.update(choices=chat.find_all_histories(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id'))
+
+    shared.gradio['load_chat_history'].upload(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        chat.start_new_chat, gradio('interface_state'), gradio('history')).then(
+        chat.load_history_json, gradio('load_chat_history', 'history'), gradio('history')).then(
+        chat.redraw_html, gradio(reload_arr), gradio('display')).then(
+        lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then(
+        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
+        lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_chat()}}')
+
+    shared.gradio['character_menu'].change(
+        chat.load_character, gradio('character_menu', 'name1', 'name2'), gradio('name1', 'name2', 'character_picture', 'greeting', 'context')).success(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        chat.load_latest_history, gradio('interface_state'), gradio('history')).then(
+        chat.redraw_html, gradio(reload_arr), gradio('display')).then(
+        lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}')
+
+    shared.gradio['mode'].change(
+        lambda x: gr.update(visible=x != 'instruct'), gradio('mode'), gradio('chat_style'), show_progress=False).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        chat.load_latest_history, gradio('interface_state'), gradio('history')).then(
+        chat.redraw_html, gradio(reload_arr), gradio('display')).then(
+        lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id'))
+
+    shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display'))
+    shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False)
+
+    # Save/delete a character
+    shared.gradio['save_character'].click(
+        lambda x: x, gradio('name2'), gradio('save_character_filename')).then(
+        lambda: gr.update(visible=True), None, gradio('character_saver'))
+
+    shared.gradio['delete_character'].click(lambda: gr.update(visible=True), None, gradio('character_deleter'))
+
+    shared.gradio['load_template'].click(
+        chat.load_instruction_template, gradio('instruction_template'), gradio('instruction_template_str')).then(
+        lambda: "Select template to load...", None, gradio('instruction_template'))
+
+    shared.gradio['save_template'].click(
+        lambda: 'My Template.yaml', None, gradio('save_filename')).then(
+        lambda: 'instruction-templates/', None, gradio('save_root')).then(
+        chat.generate_instruction_template_yaml, gradio('instruction_template_str'), gradio('save_contents')).then(
+        lambda: gr.update(visible=True), None, gradio('file_saver'))
+
+    shared.gradio['delete_template'].click(
+        lambda x: f'{x}.yaml', gradio('instruction_template'), gradio('delete_filename')).then(
+        lambda: 'instruction-templates/', None, gradio('delete_root')).then(
+        lambda: gr.update(visible=True), None, gradio('file_deleter'))
+
+    shared.gradio['save_chat_history'].click(
+        lambda x: json.dumps(x, indent=4), gradio('history'), gradio('temporary_text')).then(
+        None, gradio('temporary_text', 'character_menu', 'mode'), None, _js=f'(hist, char, mode) => {{{ui.save_files_js}; saveHistory(hist, char, mode)}}')
+
+    shared.gradio['Submit character'].click(
+        chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}')
+
+    shared.gradio['Submit tavern character'].click(
+        chat.upload_tavern_character, gradio('upload_img_tavern', 'tavern_json'), gradio('character_menu')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}')
+
+    shared.gradio['upload_json'].upload(lambda: gr.update(interactive=True), None, gradio('Submit character'))
+    shared.gradio['upload_json'].clear(lambda: gr.update(interactive=False), None, gradio('Submit character'))
+    shared.gradio['upload_img_tavern'].upload(chat.check_tavern_character, gradio('upload_img_tavern'), gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False)
+    shared.gradio['upload_img_tavern'].clear(lambda: (None, None, None, gr.update(interactive=False)), None, gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False)
+    shared.gradio['your_picture'].change(
+        chat.upload_your_profile_picture, gradio('your_picture'), None).then(
+        partial(chat.redraw_html, reset_cache=True), gradio(reload_arr), gradio('display'))
+
+    shared.gradio['send_instruction_to_default'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then(
+        partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('textbox-default')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}')
+
+    shared.gradio['send_instruction_to_notebook'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then(
+        partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('textbox-notebook')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}')
+
+    shared.gradio['send_instruction_to_negative_prompt'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then(
+        partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('negative_prompt')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_generation_parameters()}}')
+
+    shared.gradio['send-chat-to-default'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        partial(chat.generate_chat_prompt, '', _continue=True), gradio('interface_state'), gradio('textbox-default')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}')
+
+    shared.gradio['send-chat-to-notebook'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        partial(chat.generate_chat_prompt, '', _continue=True), gradio('interface_state'), gradio('textbox-notebook')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}')
+
+    shared.gradio['show_controls'].change(None, gradio('show_controls'), None, _js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}')
diff --git a/modules/ui_default.py b/modules/ui_default.py
new file mode 100644
index 0000000000000000000000000000000000000000..7db6f0d93abcc36354b0d687d83c865b8f5dd406
--- /dev/null
+++ b/modules/ui_default.py
@@ -0,0 +1,104 @@
+import gradio as gr
+
+from modules import logits, shared, ui, utils
+from modules.prompts import count_tokens, load_prompt
+from modules.text_generation import (
+    generate_reply_wrapper,
+    get_token_ids,
+    stop_everything_event
+)
+from modules.utils import gradio
+
+inputs = ('textbox-default', 'interface_state')
+outputs = ('output_textbox', 'html-default')
+
+
+def create_ui():
+    mu = shared.args.multi_user
+    with gr.Tab('Default', elem_id='default-tab'):
+        shared.gradio['last_input-default'] = gr.State('')
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    shared.gradio['textbox-default'] = gr.Textbox(value='', lines=27, label='Input', elem_classes=['textbox_default', 'add_scrollbar'])
+                    shared.gradio['token-counter-default'] = gr.HTML(value="<span>0</span>", elem_classes=["token-counter", "default-token-counter"])
+
+                with gr.Row():
+                    shared.gradio['Generate-default'] = gr.Button('Generate', variant='primary')
+                    shared.gradio['Stop-default'] = gr.Button('Stop', elem_id='stop')
+                    shared.gradio['Continue-default'] = gr.Button('Continue')
+
+                with gr.Row():
+                    shared.gradio['prompt_menu-default'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown')
+                    ui.create_refresh_button(shared.gradio['prompt_menu-default'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, 'refresh-button', interactive=not mu)
+                    shared.gradio['save_prompt-default'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu)
+                    shared.gradio['delete_prompt-default'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
+
+            with gr.Column():
+                with gr.Tab('Raw'):
+                    shared.gradio['output_textbox'] = gr.Textbox(lines=27, label='Output', elem_id='textbox-default', elem_classes=['textbox_default_output', 'add_scrollbar'])
+
+                with gr.Tab('Markdown'):
+                    shared.gradio['markdown_render-default'] = gr.Button('Render')
+                    shared.gradio['markdown-default'] = gr.Markdown()
+
+                with gr.Tab('HTML'):
+                    shared.gradio['html-default'] = gr.HTML()
+
+                with gr.Tab('Logits'):
+                    with gr.Row():
+                        with gr.Column(scale=10):
+                            shared.gradio['get_logits-default'] = gr.Button('Get next token probabilities')
+                        with gr.Column(scale=1):
+                            shared.gradio['use_samplers-default'] = gr.Checkbox(label='Use samplers', value=True, elem_classes=['no-background'])
+
+                    with gr.Row():
+                        shared.gradio['logits-default'] = gr.Textbox(lines=23, label='Output', elem_classes=['textbox_logits', 'add_scrollbar'])
+                        shared.gradio['logits-default-previous'] = gr.Textbox(lines=23, label='Previous output', elem_classes=['textbox_logits', 'add_scrollbar'])
+
+                with gr.Tab('Tokens'):
+                    shared.gradio['get_tokens-default'] = gr.Button('Get token IDs for the input')
+                    shared.gradio['tokens-default'] = gr.Textbox(lines=23, label='Tokens', elem_classes=['textbox_logits', 'add_scrollbar', 'monospace'])
+
+
+def create_event_handlers():
+    shared.gradio['Generate-default'].click(
+        lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}')
+
+    shared.gradio['textbox-default'].submit(
+        lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}')
+
+    shared.gradio['markdown_render-default'].click(lambda x: x, gradio('output_textbox'), gradio('markdown-default'), queue=False)
+    shared.gradio['Continue-default'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        generate_reply_wrapper, [shared.gradio['output_textbox']] + gradio(inputs)[1:], gradio(outputs), show_progress=False).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}')
+
+    shared.gradio['Stop-default'].click(stop_everything_event, None, None, queue=False)
+    shared.gradio['prompt_menu-default'].change(load_prompt, gradio('prompt_menu-default'), gradio('textbox-default'), show_progress=False)
+    shared.gradio['save_prompt-default'].click(
+        lambda x: x, gradio('textbox-default'), gradio('save_contents')).then(
+        lambda: 'prompts/', None, gradio('save_root')).then(
+        lambda: utils.current_time() + '.txt', None, gradio('save_filename')).then(
+        lambda: gr.update(visible=True), None, gradio('file_saver'))
+
+    shared.gradio['delete_prompt-default'].click(
+        lambda: 'prompts/', None, gradio('delete_root')).then(
+        lambda x: x + '.txt', gradio('prompt_menu-default'), gradio('delete_filename')).then(
+        lambda: gr.update(visible=True), None, gradio('file_deleter'))
+
+    shared.gradio['textbox-default'].change(lambda x: f"<span>{count_tokens(x)}</span>", gradio('textbox-default'), gradio('token-counter-default'), show_progress=False)
+    shared.gradio['get_logits-default'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        logits.get_next_logits, gradio('textbox-default', 'interface_state', 'use_samplers-default', 'logits-default'), gradio('logits-default', 'logits-default-previous'), show_progress=False)
+
+    shared.gradio['get_tokens-default'].click(get_token_ids, gradio('textbox-default'), gradio('tokens-default'), show_progress=False)
diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py
new file mode 100644
index 0000000000000000000000000000000000000000..7147121773eb091a3c0bf6ebe14905783cf67405
--- /dev/null
+++ b/modules/ui_file_saving.py
@@ -0,0 +1,103 @@
+import gradio as gr
+
+from modules import chat, presets, shared, ui, utils
+from modules.utils import gradio
+
+
+def create_ui():
+    mu = shared.args.multi_user
+
+    # Text file saver
+    with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['file_saver']:
+        shared.gradio['save_filename'] = gr.Textbox(lines=1, label='File name')
+        shared.gradio['save_root'] = gr.Textbox(lines=1, label='File folder', info='For reference. Unchangeable.', interactive=False)
+        shared.gradio['save_contents'] = gr.Textbox(lines=10, label='File contents')
+        with gr.Row():
+            shared.gradio['save_cancel'] = gr.Button('Cancel', elem_classes="small-button")
+            shared.gradio['save_confirm'] = gr.Button('Save', elem_classes="small-button", variant='primary', interactive=not mu)
+
+    # Text file deleter
+    with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['file_deleter']:
+        shared.gradio['delete_filename'] = gr.Textbox(lines=1, label='File name')
+        shared.gradio['delete_root'] = gr.Textbox(lines=1, label='File folder', info='For reference. Unchangeable.', interactive=False)
+        with gr.Row():
+            shared.gradio['delete_cancel'] = gr.Button('Cancel', elem_classes="small-button")
+            shared.gradio['delete_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop', interactive=not mu)
+
+    # Character saver/deleter
+    with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['character_saver']:
+        shared.gradio['save_character_filename'] = gr.Textbox(lines=1, label='File name', info='The character will be saved to your characters/ folder with this base filename.')
+        with gr.Row():
+            shared.gradio['save_character_cancel'] = gr.Button('Cancel', elem_classes="small-button")
+            shared.gradio['save_character_confirm'] = gr.Button('Save', elem_classes="small-button", variant='primary', interactive=not mu)
+
+    with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['character_deleter']:
+        gr.Markdown('Confirm the character deletion?')
+        with gr.Row():
+            shared.gradio['delete_character_cancel'] = gr.Button('Cancel', elem_classes="small-button")
+            shared.gradio['delete_character_confirm'] = gr.Button('Delete', elem_classes="small-button", variant='stop', interactive=not mu)
+
+    # Preset saver
+    with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['preset_saver']:
+        shared.gradio['save_preset_filename'] = gr.Textbox(lines=1, label='File name', info='The preset will be saved to your presets/ folder with this base filename.')
+        shared.gradio['save_preset_contents'] = gr.Textbox(lines=10, label='File contents')
+        with gr.Row():
+            shared.gradio['save_preset_cancel'] = gr.Button('Cancel', elem_classes="small-button")
+            shared.gradio['save_preset_confirm'] = gr.Button('Save', elem_classes="small-button", variant='primary', interactive=not mu)
+
+
+def create_event_handlers():
+    shared.gradio['save_confirm'].click(
+        lambda x, y, z: utils.save_file(x + y, z), gradio('save_root', 'save_filename', 'save_contents'), None).then(
+        lambda: gr.update(visible=False), None, gradio('file_saver'))
+
+    shared.gradio['delete_confirm'].click(
+        lambda x, y: utils.delete_file(x + y), gradio('delete_root', 'delete_filename'), None).then(
+        lambda: gr.update(visible=False), None, gradio('file_deleter'))
+
+    shared.gradio['delete_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_deleter'))
+    shared.gradio['save_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_saver'))
+
+    shared.gradio['save_character_confirm'].click(
+        chat.save_character, gradio('name2', 'greeting', 'context', 'character_picture', 'save_character_filename'), None).then(
+        lambda: gr.update(visible=False), None, gradio('character_saver')).then(
+        lambda x: gr.update(choices=utils.get_available_characters(), value=x), gradio('save_character_filename'), gradio('character_menu'))
+
+    shared.gradio['delete_character_confirm'].click(
+        lambda x: str(utils.get_available_characters().index(x)), gradio('character_menu'), gradio('temporary_text')).then(
+        chat.delete_character, gradio('character_menu'), None).then(
+        chat.update_character_menu_after_deletion, gradio('temporary_text'), gradio('character_menu')).then(
+        lambda: gr.update(visible=False), None, gradio('character_deleter'))
+
+    shared.gradio['save_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_saver'))
+    shared.gradio['delete_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_deleter'))
+
+    shared.gradio['save_preset'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        presets.generate_preset_yaml, gradio('interface_state'), gradio('save_preset_contents')).then(
+        lambda: 'My Preset', None, gradio('save_preset_filename')).then(
+        lambda: gr.update(visible=True), None, gradio('preset_saver'))
+
+    shared.gradio['save_preset_confirm'].click(
+        lambda x, y: utils.save_file(f'presets/{x}.yaml', y), gradio('save_preset_filename', 'save_preset_contents'), None).then(
+        lambda: gr.update(visible=False), None, gradio('preset_saver')).then(
+        lambda x: gr.update(choices=utils.get_available_presets(), value=x), gradio('save_preset_filename'), gradio('preset_menu'))
+
+    shared.gradio['save_preset_cancel'].click(lambda: gr.update(visible=False), None, gradio('preset_saver'))
+
+    shared.gradio['delete_preset'].click(
+        lambda x: f'{x}.yaml', gradio('preset_menu'), gradio('delete_filename')).then(
+        lambda: 'presets/', None, gradio('delete_root')).then(
+        lambda: gr.update(visible=True), None, gradio('file_deleter'))
+
+    shared.gradio['save_grammar'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda x: x, gradio('grammar_string'), gradio('save_contents')).then(
+        lambda: 'grammars/', None, gradio('save_root')).then(
+        lambda: 'My Fancy Grammar.gbnf', None, gradio('save_filename')).then(
+        lambda: gr.update(visible=True), None, gradio('file_saver'))
+
+    shared.gradio['delete_grammar'].click(
+        lambda x: x, gradio('grammar_file'), gradio('delete_filename')).then(
+        lambda: 'grammars/', None, gradio('delete_root')).then(
+        lambda: gr.update(visible=True), None, gradio('file_deleter'))
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
new file mode 100644
index 0000000000000000000000000000000000000000..23679097730df1b6cb9f3b694213c3679bd42f36
--- /dev/null
+++ b/modules/ui_model_menu.py
@@ -0,0 +1,280 @@
+import importlib
+import math
+import re
+import traceback
+from functools import partial
+from pathlib import Path
+
+import gradio as gr
+import psutil
+import torch
+from transformers import is_torch_xpu_available
+
+from modules import loaders, shared, ui, utils
+from modules.logging_colors import logger
+from modules.LoRA import add_lora_to_model
+from modules.models import load_model, unload_model
+from modules.models_settings import (
+    apply_model_settings_to_state,
+    get_model_metadata,
+    save_model_settings,
+    update_model_parameters
+)
+from modules.utils import gradio
+
+
+def create_ui():
+    mu = shared.args.multi_user
+
+    # Finding the default values for the GPU and CPU memories
+    total_mem = []
+    if is_torch_xpu_available():
+        for i in range(torch.xpu.device_count()):
+            total_mem.append(math.floor(torch.xpu.get_device_properties(i).total_memory / (1024 * 1024)))
+    else:
+        for i in range(torch.cuda.device_count()):
+            total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)))
+
+    default_gpu_mem = []
+    if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0:
+        for i in shared.args.gpu_memory:
+            if 'mib' in i.lower():
+                default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)))
+            else:
+                default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000)
+
+    while len(default_gpu_mem) < len(total_mem):
+        default_gpu_mem.append(0)
+
+    total_cpu_mem = math.floor(psutil.virtual_memory().total / (1024 * 1024))
+    if shared.args.cpu_memory is not None:
+        default_cpu_mem = re.sub('[a-zA-Z ]', '', shared.args.cpu_memory)
+    else:
+        default_cpu_mem = 0
+
+    with gr.Tab("Model", elem_id="model-tab"):
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    with gr.Column():
+                        with gr.Row():
+                            shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu)
+                            ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
+                            shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button', interactive=not mu)
+                            shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu)
+                            shared.gradio['reload_model'] = gr.Button("Reload", elem_classes='refresh-button', interactive=not mu)
+                            shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu)
+
+                    with gr.Column():
+                        with gr.Row():
+                            shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu)
+                            ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
+                            shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)
+
+        with gr.Row():
+            with gr.Column():
+                shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None)
+                with gr.Box():
+                    with gr.Row():
+                        with gr.Column():
+                            with gr.Blocks():
+                                for i in range(len(total_mem)):
+                                    shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i])
+
+                                shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem)
+
+                            with gr.Blocks():
+                                shared.gradio['transformers_info'] = gr.Markdown('load-in-4bit params:')
+                                shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype)
+                                shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type)
+
+                            shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
+                            shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers)
+                            shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=shared.settings['truncation_length_max'], step=256, label="n_ctx", value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.')
+                            shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 18,17')
+                            shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch)
+                            shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads)
+                            shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=32, value=shared.args.threads_batch)
+                            shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
+                            shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
+                            shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None")
+                            shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
+                            shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
+                            shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len)
+                            with gr.Blocks():
+                                shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.05, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
+                                shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base)
+                                shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb)
+
+                            shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.')
+                            shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.')
+
+                        with gr.Column():
+                            shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
+                            shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
+                            shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
+                            shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
+                            shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
+                            shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards.')
+                            shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
+                            shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
+                            shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the  K, Q, V to the GPU. This saves VRAM but reduces the performance.')
+                            shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.')
+                            shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
+                            shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
+                            shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
+                            shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
+                            shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
+                            shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
+                            shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
+                            shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
+                            shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
+                            shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
+                            shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
+                            shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
+                            shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
+                            shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
+                            with gr.Blocks():
+                                shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
+                                shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
+                                shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work with this loader. Otherwise, ignore it, as it makes prompt processing slower.')
+
+                            shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.')
+                            shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.')
+                            shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.')
+                            shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
+                            shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1 (recommended): place your .gguf in a subfolder of models/ along with these 4 files: special_tokens_map.json, tokenizer_config.json, tokenizer.json, tokenizer.model.\n\nOption 2: download `oobabooga/llama-tokenizer` under "Download model or LoRA". That\'s a default Llama tokenizer that will work for some (but not all) models.')
+
+            with gr.Column():
+                with gr.Row():
+                    shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu)
+
+                shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.", interactive=not mu)
+                shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1, interactive=not mu)
+                with gr.Row():
+                    shared.gradio['download_model_button'] = gr.Button("Download", variant='primary', interactive=not mu)
+                    shared.gradio['get_file_list'] = gr.Button("Get file list", interactive=not mu)
+
+                with gr.Row():
+                    shared.gradio['model_status'] = gr.Markdown('No model is loaded' if shared.model_name == 'None' else 'Ready')
+
+
+def create_event_handlers():
+    shared.gradio['loader'].change(
+        loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params())).then(
+        lambda value: gr.update(choices=loaders.get_model_types(value)), gradio('loader'), gradio('model_type'))
+
+    # In this event handler, the interface state is read and updated
+    # with the model defaults (if any), and then the model is loaded
+    # unless "autoload_model" is unchecked
+    shared.gradio['model_menu'].change(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        apply_model_settings_to_state, gradio('model_menu', 'interface_state'), gradio('interface_state')).then(
+        ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then(
+        update_model_parameters, gradio('interface_state'), None).then(
+        load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=False).success(
+        update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length')).then(
+        lambda x: x, gradio('loader'), gradio('filter_by_loader'))
+
+    shared.gradio['load_model'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        update_model_parameters, gradio('interface_state'), None).then(
+        partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success(
+        update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length')).then(
+        lambda x: x, gradio('loader'), gradio('filter_by_loader'))
+
+    shared.gradio['reload_model'].click(
+        unload_model, None, None).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        update_model_parameters, gradio('interface_state'), None).then(
+        partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success(
+        update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length')).then(
+        lambda x: x, gradio('loader'), gradio('filter_by_loader'))
+
+    shared.gradio['unload_model'].click(
+        unload_model, None, None).then(
+        lambda: "Model unloaded", None, gradio('model_status'))
+
+    shared.gradio['save_model_settings'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)
+
+    shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
+    shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
+    shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
+    shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model'))
+
+
+def load_model_wrapper(selected_model, loader, autoload=False):
+    if not autoload:
+        yield f"The settings for `{selected_model}` have been updated.\n\nClick on \"Load\" to load it."
+        return
+
+    if selected_model == 'None':
+        yield "No model selected"
+    else:
+        try:
+            yield f"Loading `{selected_model}`..."
+            unload_model()
+            if selected_model != '':
+                shared.model, shared.tokenizer = load_model(selected_model, loader)
+
+            if shared.model is not None:
+                output = f"Successfully loaded `{selected_model}`."
+
+                settings = get_model_metadata(selected_model)
+                if 'instruction_template' in settings:
+                    output += '\n\nIt seems to be an instruction-following model with template "{}". In the chat tab, instruct or chat-instruct modes should be used.'.format(settings['instruction_template'])
+
+                yield output
+            else:
+                yield f"Failed to load `{selected_model}`."
+        except:
+            exc = traceback.format_exc()
+            logger.error('Failed to load the model.')
+            print(exc)
+            yield exc.replace('\n', '\n\n')
+
+
+def load_lora_wrapper(selected_loras):
+    yield ("Applying the following LoRAs to {}:\n\n{}".format(shared.model_name, '\n'.join(selected_loras)))
+    add_lora_to_model(selected_loras)
+    yield ("Successfuly applied the LoRAs")
+
+
+def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False):
+    try:
+        progress(0.0)
+        downloader = importlib.import_module("download-model").ModelDownloader()
+        model, branch = downloader.sanitize_model_and_branch_names(repo_id, None)
+        yield ("Getting the download links from Hugging Face")
+        links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file)
+        if return_links:
+            yield '\n\n'.join([f"`{Path(link).name}`" for link in links])
+            return
+
+        yield ("Getting the output folder")
+        base_folder = shared.args.lora_dir if is_lora else shared.args.model_dir
+        output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp, base_folder=base_folder)
+        if check:
+            progress(0.5)
+            yield ("Checking previously downloaded files")
+            downloader.check_model_files(model, branch, links, sha256, output_folder)
+            progress(1.0)
+        else:
+            yield (f"Downloading file{'s' if len(links) > 1 else ''} to `{output_folder}/`")
+            downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=4, is_llamacpp=is_llamacpp)
+            yield ("Done!")
+    except:
+        progress(1.0)
+        yield traceback.format_exc().replace('\n', '\n\n')
+
+
+def update_truncation_length(current_length, state):
+    if 'loader' in state:
+        if state['loader'].lower().startswith('exllama'):
+            return state['max_seq_len']
+        elif state['loader'] in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:
+            return state['n_ctx']
+
+    return current_length
diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bd5c919f797a30003f291ed40ca82a924f760e7
--- /dev/null
+++ b/modules/ui_notebook.py
@@ -0,0 +1,106 @@
+import gradio as gr
+
+from modules import logits, shared, ui, utils
+from modules.prompts import count_tokens, load_prompt
+from modules.text_generation import (
+    generate_reply_wrapper,
+    get_token_ids,
+    stop_everything_event
+)
+from modules.utils import gradio
+
+inputs = ('textbox-notebook', 'interface_state')
+outputs = ('textbox-notebook', 'html-notebook')
+
+
+def create_ui():
+    mu = shared.args.multi_user
+    with gr.Tab('Notebook', elem_id='notebook-tab'):
+        shared.gradio['last_input-notebook'] = gr.State('')
+        with gr.Row():
+            with gr.Column(scale=4):
+                with gr.Tab('Raw'):
+                    with gr.Row():
+                        shared.gradio['textbox-notebook'] = gr.Textbox(value='', lines=27, elem_id='textbox-notebook', elem_classes=['textbox', 'add_scrollbar'])
+                        shared.gradio['token-counter-notebook'] = gr.HTML(value="<span>0</span>", elem_classes=["token-counter"])
+
+                with gr.Tab('Markdown'):
+                    shared.gradio['markdown_render-notebook'] = gr.Button('Render')
+                    shared.gradio['markdown-notebook'] = gr.Markdown()
+
+                with gr.Tab('HTML'):
+                    shared.gradio['html-notebook'] = gr.HTML()
+
+                with gr.Tab('Logits'):
+                    with gr.Row():
+                        with gr.Column(scale=10):
+                            shared.gradio['get_logits-notebook'] = gr.Button('Get next token probabilities')
+                        with gr.Column(scale=1):
+                            shared.gradio['use_samplers-notebook'] = gr.Checkbox(label='Use samplers', value=True, elem_classes=['no-background'])
+
+                    with gr.Row():
+                        shared.gradio['logits-notebook'] = gr.Textbox(lines=23, label='Output', elem_classes=['textbox_logits_notebook', 'add_scrollbar'])
+                        shared.gradio['logits-notebook-previous'] = gr.Textbox(lines=23, label='Previous output', elem_classes=['textbox_logits_notebook', 'add_scrollbar'])
+
+                with gr.Tab('Tokens'):
+                    shared.gradio['get_tokens-notebook'] = gr.Button('Get token IDs for the input')
+                    shared.gradio['tokens-notebook'] = gr.Textbox(lines=23, label='Tokens', elem_classes=['textbox_logits_notebook', 'add_scrollbar', 'monospace'])
+
+                with gr.Row():
+                    shared.gradio['Generate-notebook'] = gr.Button('Generate', variant='primary', elem_classes='small-button')
+                    shared.gradio['Stop-notebook'] = gr.Button('Stop', elem_classes='small-button', elem_id='stop')
+                    shared.gradio['Undo'] = gr.Button('Undo', elem_classes='small-button')
+                    shared.gradio['Regenerate-notebook'] = gr.Button('Regenerate', elem_classes='small-button')
+
+            with gr.Column(scale=1):
+                gr.HTML('<div style="padding-bottom: 13px"></div>')
+                with gr.Row():
+                    shared.gradio['prompt_menu-notebook'] = gr.Dropdown(choices=utils.get_available_prompts(), value='None', label='Prompt', elem_classes='slim-dropdown')
+                    ui.create_refresh_button(shared.gradio['prompt_menu-notebook'], lambda: None, lambda: {'choices': utils.get_available_prompts()}, ['refresh-button', 'refresh-button-small'], interactive=not mu)
+                    shared.gradio['save_prompt-notebook'] = gr.Button('💾', elem_classes=['refresh-button', 'refresh-button-small'], interactive=not mu)
+                    shared.gradio['delete_prompt-notebook'] = gr.Button('🗑️', elem_classes=['refresh-button', 'refresh-button-small'], interactive=not mu)
+
+
+def create_event_handlers():
+    shared.gradio['Generate-notebook'].click(
+        lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}')
+
+    shared.gradio['textbox-notebook'].submit(
+        lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}')
+
+    shared.gradio['Undo'].click(lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False)
+    shared.gradio['markdown_render-notebook'].click(lambda x: x, gradio('textbox-notebook'), gradio('markdown-notebook'), queue=False)
+    shared.gradio['Regenerate-notebook'].click(
+        lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        lambda: None, None, None, _js=f'() => {{{ui.audio_notification_js}}}')
+
+    shared.gradio['Stop-notebook'].click(stop_everything_event, None, None, queue=False)
+    shared.gradio['prompt_menu-notebook'].change(load_prompt, gradio('prompt_menu-notebook'), gradio('textbox-notebook'), show_progress=False)
+    shared.gradio['save_prompt-notebook'].click(
+        lambda x: x, gradio('textbox-notebook'), gradio('save_contents')).then(
+        lambda: 'prompts/', None, gradio('save_root')).then(
+        lambda: utils.current_time() + '.txt', None, gradio('save_filename')).then(
+        lambda: gr.update(visible=True), None, gradio('file_saver'))
+
+    shared.gradio['delete_prompt-notebook'].click(
+        lambda: 'prompts/', None, gradio('delete_root')).then(
+        lambda x: x + '.txt', gradio('prompt_menu-notebook'), gradio('delete_filename')).then(
+        lambda: gr.update(visible=True), None, gradio('file_deleter'))
+
+    shared.gradio['textbox-notebook'].input(lambda x: f"<span>{count_tokens(x)}</span>", gradio('textbox-notebook'), gradio('token-counter-notebook'), show_progress=False)
+    shared.gradio['get_logits-notebook'].click(
+        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+        logits.get_next_logits, gradio('textbox-notebook', 'interface_state', 'use_samplers-notebook', 'logits-notebook'), gradio('logits-notebook', 'logits-notebook-previous'), show_progress=False)
+
+    shared.gradio['get_tokens-notebook'].click(get_token_ids, gradio('textbox-notebook'), gradio('tokens-notebook'), show_progress=False)
diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..078590dc119340064b081108da58d031d405a88e
--- /dev/null
+++ b/modules/ui_parameters.py
@@ -0,0 +1,124 @@
+from pathlib import Path
+
+import gradio as gr
+
+from modules import loaders, presets, shared, ui, ui_chat, utils
+from modules.utils import gradio
+
+
+def create_ui(default_preset):
+    mu = shared.args.multi_user
+    generate_params = presets.load_preset(default_preset)
+    with gr.Tab("Parameters", elem_id="parameters"):
+        with gr.Tab("Generation"):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset, label='Preset', elem_classes='slim-dropdown')
+                        ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button', interactive=not mu)
+                        shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu)
+                        shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
+                        shared.gradio['random_preset'] = gr.Button('🎲', elem_classes='refresh-button')
+
+                with gr.Column():
+                    shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown')
+
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        with gr.Column():
+                            shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens'])
+                            shared.gradio['temperature'] = gr.Slider(0.01, 5, value=generate_params['temperature'], step=0.01, label='temperature')
+                            shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p')
+                            shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=generate_params['min_p'], step=0.01, label='min_p')
+                            shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k')
+                            shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty')
+                            shared.gradio['presence_penalty'] = gr.Slider(0, 2, value=generate_params['presence_penalty'], step=0.05, label='presence_penalty')
+                            shared.gradio['frequency_penalty'] = gr.Slider(0, 2, value=generate_params['frequency_penalty'], step=0.05, label='frequency_penalty')
+                            shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=generate_params['repetition_penalty_range'], label='repetition_penalty_range')
+                            shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p')
+                            shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs')
+                            shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a')
+                            shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff')
+                            shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff')
+
+                        with gr.Column():
+                            shared.gradio['guidance_scale'] = gr.Slider(-0.5, 2.5, step=0.05, value=generate_params['guidance_scale'], label='guidance_scale', info='For CFG. 1.5 is a good value.')
+                            shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt', lines=3, elem_classes=['add_scrollbar'])
+                            shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=generate_params['penalty_alpha'], label='penalty_alpha', info='For Contrastive Search. do_sample must be unchecked.')
+                            shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode', info='mode=1 is for llama.cpp only.')
+                            shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau')
+                            shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta')
+                            shared.gradio['smoothing_factor'] = gr.Slider(0.0, 10.0, value=generate_params['smoothing_factor'], step=0.01, label='smoothing_factor', info='Activates Quadratic Sampling.')
+                            shared.gradio['dynamic_temperature'] = gr.Checkbox(value=generate_params['dynamic_temperature'], label='dynamic_temperature')
+                            shared.gradio['dynatemp_low'] = gr.Slider(0.01, 5, value=generate_params['dynatemp_low'], step=0.01, label='dynatemp_low', visible=generate_params['dynamic_temperature'])
+                            shared.gradio['dynatemp_high'] = gr.Slider(0.01, 5, value=generate_params['dynatemp_high'], step=0.01, label='dynatemp_high', visible=generate_params['dynamic_temperature'])
+                            shared.gradio['dynatemp_exponent'] = gr.Slider(0.01, 5, value=generate_params['dynatemp_exponent'], step=0.01, label='dynatemp_exponent', visible=generate_params['dynamic_temperature'])
+                            shared.gradio['temperature_last'] = gr.Checkbox(value=generate_params['temperature_last'], label='temperature_last', info='Moves temperature/dynamic temperature/quadratic sampling to the end of the sampler stack, ignoring their positions in "Sampler priority".')
+                            shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample')
+                            shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)')
+                            with gr.Accordion('Other parameters', open=False):
+                                shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=generate_params['encoder_repetition_penalty'], step=0.01, label='encoder_repetition_penalty')
+                                shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=generate_params['no_repeat_ngram_size'], label='no_repeat_ngram_size')
+                                shared.gradio['min_length'] = gr.Slider(0, 2000, step=1, value=generate_params['min_length'], label='min_length')
+                                shared.gradio['num_beams'] = gr.Slider(1, 20, step=1, value=generate_params['num_beams'], label='num_beams', info='For Beam Search, along with length_penalty and early_stopping.')
+                                shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty')
+                                shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping')
+
+                    gr.Markdown("[Learn more](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab)")
+
+                with gr.Column():
+                    with gr.Row():
+                        with gr.Column():
+                            shared.gradio['truncation_length'] = gr.Slider(value=get_truncation_length(), minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.')
+                            shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.')
+                            shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.')
+                            shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.')
+
+                            shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas.', placeholder='"\\n", "\\nYou:"')
+                            shared.gradio['custom_token_bans'] = gr.Textbox(value=shared.settings['custom_token_bans'] or None, label='Custom token bans', info='Specific token IDs to ban from generating, comma-separated. The IDs can be found in the Default or Notebook tab.')
+
+                        with gr.Column():
+                            shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.')
+                            shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.')
+                            shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.')
+                            shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
+                            shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming')
+
+                            with gr.Blocks():
+                                shared.gradio['sampler_priority'] = gr.Textbox(value=generate_params['sampler_priority'], lines=12, label='Sampler priority', info='Parameter names separated by new lines or commas.')
+
+                            with gr.Row() as shared.gradio['grammar_file_row']:
+                                shared.gradio['grammar_file'] = gr.Dropdown(value='None', choices=utils.get_available_grammars(), label='Load grammar from file (.gbnf)', elem_classes='slim-dropdown')
+                                ui.create_refresh_button(shared.gradio['grammar_file'], lambda: None, lambda: {'choices': utils.get_available_grammars()}, 'refresh-button', interactive=not mu)
+                                shared.gradio['save_grammar'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu)
+                                shared.gradio['delete_grammar'] = gr.Button('🗑️ ', elem_classes='refresh-button', interactive=not mu)
+
+                    shared.gradio['grammar_string'] = gr.Textbox(value='', label='Grammar', lines=16, elem_classes=['add_scrollbar', 'monospace'])
+
+        ui_chat.create_chat_settings_ui()
+
+
+def create_event_handlers():
+    shared.gradio['filter_by_loader'].change(loaders.blacklist_samplers, gradio('filter_by_loader', 'dynamic_temperature'), gradio(loaders.list_all_samplers()), show_progress=False)
+    shared.gradio['preset_menu'].change(presets.load_preset_for_ui, gradio('preset_menu', 'interface_state'), gradio('interface_state') + gradio(presets.presets_params()))
+    shared.gradio['random_preset'].click(presets.random_preset, gradio('interface_state'), gradio('interface_state') + gradio(presets.presets_params()))
+    shared.gradio['grammar_file'].change(load_grammar, gradio('grammar_file'), gradio('grammar_string'))
+    shared.gradio['dynamic_temperature'].change(lambda x: [gr.update(visible=x)] * 3, gradio('dynamic_temperature'), gradio('dynatemp_low', 'dynatemp_high', 'dynatemp_exponent'))
+
+
+def get_truncation_length():
+    if 'max_seq_len' in shared.provided_arguments or shared.args.max_seq_len != shared.args_defaults.max_seq_len:
+        return shared.args.max_seq_len
+    elif 'n_ctx' in shared.provided_arguments or shared.args.n_ctx != shared.args_defaults.n_ctx:
+        return shared.args.n_ctx
+    else:
+        return shared.settings['truncation_length']
+
+
+def load_grammar(name):
+    p = Path(f'grammars/{name}')
+    if p.exists():
+        return open(p, 'r', encoding='utf-8').read()
+    else:
+        return ''
diff --git a/modules/ui_session.py b/modules/ui_session.py
new file mode 100644
index 0000000000000000000000000000000000000000..989046eae8d8192610508ac731f9320996d03eda
--- /dev/null
+++ b/modules/ui_session.py
@@ -0,0 +1,74 @@
+import gradio as gr
+
+from modules import shared, ui, utils
+from modules.github import clone_or_pull_repository
+from modules.utils import gradio
+
+
+def create_ui():
+    mu = shared.args.multi_user
+    with gr.Tab("Session", elem_id="session-tab"):
+        with gr.Row():
+            with gr.Column():
+                shared.gradio['reset_interface'] = gr.Button("Apply flags/extensions and restart", interactive=not mu)
+                with gr.Row():
+                    shared.gradio['toggle_dark_mode'] = gr.Button('Toggle 💡')
+                    shared.gradio['save_settings'] = gr.Button('Save UI defaults to settings.yaml', interactive=not mu)
+
+                with gr.Row():
+                    with gr.Column():
+                        shared.gradio['extensions_menu'] = gr.CheckboxGroup(choices=utils.get_available_extensions(), value=shared.args.extensions, label="Available extensions", info='Note that some of these extensions may require manually installing Python requirements through the command: pip install -r extensions/extension_name/requirements.txt', elem_classes='checkboxgroup-table')
+
+                    with gr.Column():
+                        shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=get_boolean_arguments(), value=get_boolean_arguments(active=True), label="Boolean command-line flags", elem_classes='checkboxgroup-table')
+
+            with gr.Column():
+                extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️  WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu)
+                extension_status = gr.Markdown()
+
+        shared.gradio['theme_state'] = gr.Textbox(visible=False, value='dark' if shared.settings['dark_theme'] else 'light')
+        extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False)
+
+        # Reset interface event
+        shared.gradio['reset_interface'].click(
+            set_interface_arguments, gradio('extensions_menu', 'bool_menu'), None).then(
+            lambda: None, None, None, _js='() => {document.body.innerHTML=\'<h1 style="font-family:monospace;padding-top:20%;margin:0;height:100vh;color:lightgray;text-align:center;background:var(--body-background-fill)">Reloading...</h1>\'; setTimeout(function(){location.reload()},2500); return []}')
+
+        shared.gradio['toggle_dark_mode'].click(
+            lambda: None, None, None, _js='() => {document.getElementsByTagName("body")[0].classList.toggle("dark")}').then(
+            lambda x: 'dark' if x == 'light' else 'light', gradio('theme_state'), gradio('theme_state'))
+
+        shared.gradio['save_settings'].click(
+            ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
+            ui.save_settings, gradio('interface_state', 'preset_menu', 'extensions_menu', 'show_controls', 'theme_state'), gradio('save_contents')).then(
+            lambda: './', None, gradio('save_root')).then(
+            lambda: 'settings.yaml', None, gradio('save_filename')).then(
+            lambda: gr.update(visible=True), None, gradio('file_saver'))
+
+
+def set_interface_arguments(extensions, bool_active):
+    shared.args.extensions = extensions
+
+    bool_list = get_boolean_arguments()
+
+    for k in bool_list:
+        setattr(shared.args, k, False)
+    for k in bool_active:
+        setattr(shared.args, k, True)
+        if k == 'api':
+            shared.add_extension('openai', last=True)
+
+    shared.need_restart = True
+
+
+def get_boolean_arguments(active=False):
+    exclude = shared.deprecated_args
+
+    cmd_list = vars(shared.args)
+    bool_list = sorted([k for k in cmd_list if type(cmd_list[k]) is bool and k not in exclude + ui.list_model_elements()])
+    bool_active = [k for k in bool_list if vars(shared.args)[k]]
+
+    if active:
+        return bool_active
+    else:
+        return bool_list
diff --git a/modules/utils.py b/modules/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..de6d32dca61c0ebc76395d8360a55b084bb56280
--- /dev/null
+++ b/modules/utils.py
@@ -0,0 +1,134 @@
+import os
+import re
+from datetime import datetime
+from pathlib import Path
+
+from modules import github, shared
+from modules.logging_colors import logger
+
+
+# Helper function to get multiple values from shared.gradio
+def gradio(*keys):
+    if len(keys) == 1 and type(keys[0]) in [list, tuple]:
+        keys = keys[0]
+
+    return [shared.gradio[k] for k in keys]
+
+
+def save_file(fname, contents):
+    if fname == '':
+        logger.error('File name is empty!')
+        return
+
+    root_folder = Path(__file__).resolve().parent.parent
+    abs_path_str = os.path.abspath(fname)
+    rel_path_str = os.path.relpath(abs_path_str, root_folder)
+    rel_path = Path(rel_path_str)
+    if rel_path.parts[0] == '..':
+        logger.error(f'Invalid file path: \"{fname}\"')
+        return
+
+    with open(abs_path_str, 'w', encoding='utf-8') as f:
+        f.write(contents)
+
+    logger.info(f'Saved \"{abs_path_str}\".')
+
+
+def delete_file(fname):
+    if fname == '':
+        logger.error('File name is empty!')
+        return
+
+    root_folder = Path(__file__).resolve().parent.parent
+    abs_path_str = os.path.abspath(fname)
+    rel_path_str = os.path.relpath(abs_path_str, root_folder)
+    rel_path = Path(rel_path_str)
+    if rel_path.parts[0] == '..':
+        logger.error(f'Invalid file path: \"{fname}\"')
+        return
+
+    if rel_path.exists():
+        rel_path.unlink()
+        logger.info(f'Deleted \"{fname}\".')
+
+
+def current_time():
+    return f"{datetime.now().strftime('%Y-%m-%d-%H%M%S')}"
+
+
+def atoi(text):
+    return int(text) if text.isdigit() else text.lower()
+
+
+# Replace multiple string pairs in a string
+def replace_all(text, dic):
+    for i, j in dic.items():
+        text = text.replace(i, j)
+
+    return text
+
+
+def natural_keys(text):
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_available_models():
+    model_list = []
+    for item in list(Path(f'{shared.args.model_dir}/').glob('*')):
+        if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml', '.py')) and 'llama-tokenizer' not in item.name:
+            model_list.append(re.sub('.pth$', '', item.name))
+
+    return ['None'] + sorted(model_list, key=natural_keys)
+
+
+def get_available_presets():
+    return sorted(set((k.stem for k in Path('presets').glob('*.yaml'))), key=natural_keys)
+
+
+def get_available_prompts():
+    prompts = []
+    files = set((k.stem for k in Path('prompts').glob('*.txt')))
+    prompts += sorted([k for k in files if re.match('^[0-9]', k)], key=natural_keys, reverse=True)
+    prompts += sorted([k for k in files if re.match('^[^0-9]', k)], key=natural_keys)
+    prompts += ['None']
+    return prompts
+
+
+def get_available_characters():
+    paths = (x for x in Path('characters').iterdir() if x.suffix in ('.json', '.yaml', '.yml'))
+    return sorted(set((k.stem for k in paths)), key=natural_keys)
+
+
+def get_available_instruction_templates():
+    path = "instruction-templates"
+    paths = []
+    if os.path.exists(path):
+        paths = (x for x in Path(path).iterdir() if x.suffix in ('.json', '.yaml', '.yml'))
+
+    return ['Select template to load...'] + sorted(set((k.stem for k in paths)), key=natural_keys)
+
+
+def get_available_extensions():
+    extensions = sorted(set(map(lambda x: x.parts[1], Path('extensions').glob('*/script.py'))), key=natural_keys)
+    extensions = [v for v in extensions if v not in github.new_extensions]
+    return extensions
+
+
+def get_available_loras():
+    return ['None'] + sorted([item.name for item in list(Path(shared.args.lora_dir).glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json'))], key=natural_keys)
+
+
+def get_datasets(path: str, ext: str):
+    # include subdirectories for raw txt files to allow training from a subdirectory of txt files
+    if ext == "txt":
+        return ['None'] + sorted(set([k.stem for k in list(Path(path).glob('*.txt')) + list(Path(path).glob('*/')) if k.stem != 'put-trainer-datasets-here']), key=natural_keys)
+
+    return ['None'] + sorted(set([k.stem for k in Path(path).glob(f'*.{ext}') if k.stem != 'put-trainer-datasets-here']), key=natural_keys)
+
+
+def get_available_chat_styles():
+    return sorted(set(('-'.join(k.stem.split('-')[1:]) for k in Path('css').glob('chat_style*.css'))), key=natural_keys)
+
+
+def get_available_grammars():
+    return ['None'] + sorted([item.name for item in list(Path('grammars').glob('*.gbnf'))], key=natural_keys)
diff --git a/presets/Big O.yaml b/presets/Big O.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2ab182687647aaa04868d08024eefaacd0b48b06
--- /dev/null
+++ b/presets/Big O.yaml	
@@ -0,0 +1,6 @@
+temperature: 0.87
+top_p: 0.99
+typical_p: 0.68
+tfs: 0.68
+repetition_penalty: 1.01
+top_k: 85
diff --git a/presets/Contrastive Search.yaml b/presets/Contrastive Search.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d9a47a9f5b75ae5d2db9da0ace7a51d962fdef94
--- /dev/null
+++ b/presets/Contrastive Search.yaml	
@@ -0,0 +1,3 @@
+do_sample: false
+top_k: 4
+penalty_alpha: 0.3
diff --git a/presets/Debug-deterministic.yaml b/presets/Debug-deterministic.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..45d5953f46ef1c67c72ea13cd180f2a5a606db39
--- /dev/null
+++ b/presets/Debug-deterministic.yaml
@@ -0,0 +1,2 @@
+do_sample: false
+top_k: 1
diff --git a/presets/Devia Preset Gen.yaml b/presets/Devia Preset Gen.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c3ed84f4102fde352d9cafc070be784792f19f95
--- /dev/null
+++ b/presets/Devia Preset Gen.yaml	
@@ -0,0 +1,17 @@
+temperature: 0.3
+top_p: 0.5
+top_k: 100
+repetition_penalty: 1.18
+sampler_priority: |-
+  temperature
+  dynamic_temperature
+  quadratic_sampling
+  top_k
+  top_p
+  typical_p
+  epsilon_cutoff
+  eta_cutoff
+  tfs
+  top_a
+  min_p
+  mirostat
diff --git a/presets/Divine Intellect.yaml b/presets/Divine Intellect.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ac750e40dc94a7a622a21e692bc399b4e8d9e2df
--- /dev/null
+++ b/presets/Divine Intellect.yaml	
@@ -0,0 +1,4 @@
+temperature: 1.31
+top_p: 0.14
+repetition_penalty: 1.17
+top_k: 49
diff --git a/presets/LLaMA-Precise.yaml b/presets/LLaMA-Precise.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c5f9cae25636cafb52a8d99c9d610c1a33a7ef3b
--- /dev/null
+++ b/presets/LLaMA-Precise.yaml
@@ -0,0 +1,4 @@
+temperature: 0.7
+top_p: 0.1
+repetition_penalty: 1.18
+top_k: 40
diff --git a/presets/Midnight Enigma.yaml b/presets/Midnight Enigma.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0bd1763c6d5aab39dd7a25ac69c453a846e93fb1
--- /dev/null
+++ b/presets/Midnight Enigma.yaml	
@@ -0,0 +1,4 @@
+temperature: 0.98
+top_p: 0.37
+repetition_penalty: 1.18
+top_k: 100
diff --git a/presets/My Preset.yaml b/presets/My Preset.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a2d1449a1d59bd93db69fe82833615542c2d8fd0
--- /dev/null
+++ b/presets/My Preset.yaml	
@@ -0,0 +1,17 @@
+temperature: 0.7
+top_p: 0.1
+top_k: 40
+repetition_penalty: 1.18
+sampler_priority: |-
+  temperature
+  dynamic_temperature
+  quadratic_sampling
+  top_k
+  top_p
+  typical_p
+  epsilon_cutoff
+  eta_cutoff
+  tfs
+  top_a
+  min_p
+  mirostat
diff --git a/presets/Null preset.yaml b/presets/Null preset.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..714aa9a3ed57177426e4ab415107838066e21fb5
--- /dev/null
+++ b/presets/Null preset.yaml	
@@ -0,0 +1 @@
+temperature: 1
diff --git a/presets/Shortwave.yaml b/presets/Shortwave.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a2528abdb4b950defd50c42ef6cbe0f884560e79
--- /dev/null
+++ b/presets/Shortwave.yaml
@@ -0,0 +1,4 @@
+temperature: 1.53
+top_p: 0.64
+repetition_penalty: 1.07
+top_k: 33
diff --git a/presets/Yara.yaml b/presets/Yara.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..87bb019ec62bb7afc395d3212b7560c25c3d36ed
--- /dev/null
+++ b/presets/Yara.yaml
@@ -0,0 +1,4 @@
+temperature: 0.82
+top_p: 0.21
+repetition_penalty: 1.19
+top_k: 72
diff --git a/presets/simple-1.yaml b/presets/simple-1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..30a106590b6d70d9c537a8b2b6c26f2a832bd8b9
--- /dev/null
+++ b/presets/simple-1.yaml
@@ -0,0 +1,4 @@
+temperature: 0.7
+top_p: 0.9
+repetition_penalty: 1.15
+top_k: 20
diff --git a/prompts/Alpaca-with-Input.txt b/prompts/Alpaca-with-Input.txt
new file mode 100644
index 0000000000000000000000000000000000000000..56df0e285be9689ab1f8ea698ce748e6d1b02af2
--- /dev/null
+++ b/prompts/Alpaca-with-Input.txt
@@ -0,0 +1,10 @@
+Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+### Instruction:
+Instruction
+
+### Input:
+Input
+
+### Response:
+
diff --git a/prompts/GPT-4chan.txt b/prompts/GPT-4chan.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1bc8c7f4613f982e3dfa367562a764cf5bd4c73b
--- /dev/null
+++ b/prompts/GPT-4chan.txt
@@ -0,0 +1,6 @@
+-----
+--- 865467536
+Hello, AI frens!
+How are you doing on this fine day?
+--- 865467537
+
diff --git a/prompts/QA.txt b/prompts/QA.txt
new file mode 100644
index 0000000000000000000000000000000000000000..32b0e2350f3c0a7f447dcd1aba11d6ae2247e5a8
--- /dev/null
+++ b/prompts/QA.txt
@@ -0,0 +1,4 @@
+Common sense questions and answers
+
+Question: 
+Factual answer: