File size: 8,432 Bytes
c447b83 2eac302 1151de3 2eac302 455513d 2eac302 455513d f0b2e2a 455513d 2eac302 455513d 2eac302 455513d b733fa5 a888915 455513d 2eac302 455513d 2eac302 455513d 2eac302 455513d 2eac302 455513d 2eac302 455513d 2eac302 455513d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
---
library_name: transformers
tags: []
---
# Model Card for Model ID
ProtST for binary localization.
## Performance
ProtST-ESM-1b outperforms in most cases.
![image/png](/static-proxy?url=https%3A%2F%2Fcdn-uploads.huggingface.co%2Fproduction%2Fuploads%2F62f0a673f0d40f6aae296b4a%2FZetv-yXrJw5V2eceVBVx7.png%3C%2Fspan%3E)
The following script shows how to finetune ProtST on Gaudi.
## Running script
```diff
from transformers import AutoModel, AutoTokenizer, HfArgumentParser, TrainingArguments, Trainer
from transformers.data.data_collator import DataCollatorWithPadding
from transformers.trainer_pt_utils import get_parameter_names
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from datasets import load_dataset
import functools
import numpy as np
from sklearn.metrics import accuracy_score, matthews_corrcoef
import sys
import torch
import logging
import datasets
import transformers
+ import habana_frameworks.torch
+ from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def create_optimizer(opt_model, lr_ratio=0.1):
head_names = []
for n, p in opt_model.named_parameters():
if "classifier" in n:
head_names.append(n)
else:
p.requires_grad = False
# turn a list of tuple to 2 lists
for n, p in opt_model.named_parameters():
if n in head_names:
assert p.requires_grad
backbone_names = []
for n, p in opt_model.named_parameters():
if n not in head_names and p.requires_grad:
backbone_names.append(n)
# for weight_decay policy, see
# https://github.com/huggingface/transformers/blob/50573c648ae953dcc1b94d663651f07fb02268f4/src/transformers/trainer.py#L947
decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS) # forbidden layer norm
decay_parameters = [name for name in decay_parameters if "bias" not in name]
# training_args.learning_rate
head_decay_parameters = [name for name in head_names if name in decay_parameters]
head_not_decay_parameters = [name for name in head_names if name not in decay_parameters]
# training_args.learning_rate * model_config.lr_ratio
backbone_decay_parameters = [name for name in backbone_names if name in decay_parameters]
backbone_not_decay_parameters = [name for name in backbone_names if name not in decay_parameters]
optimizer_grouped_parameters = [
{
"params": [p for n, p in opt_model.named_parameters() if (n in head_decay_parameters and p.requires_grad)],
"weight_decay": training_args.weight_decay,
"lr": training_args.learning_rate
},
{
"params": [p for n, p in opt_model.named_parameters() if (n in backbone_decay_parameters and p.requires_grad)],
"weight_decay": training_args.weight_decay,
"lr": training_args.learning_rate * lr_ratio
},
{
"params": [p for n, p in opt_model.named_parameters() if (n in head_not_decay_parameters and p.requires_grad)],
"weight_decay": 0.0,
"lr": training_args.learning_rate
},
{
"params": [p for n, p in opt_model.named_parameters() if (n in backbone_not_decay_parameters and p.requires_grad)],
"weight_decay": 0.0,
"lr": training_args.learning_rate * lr_ratio
},
]
- optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
+ optimizer_cls, optimizer_kwargs = GaudiTrainer.get_optimizer_cls_and_kwargs(training_args)
optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
return optimizer
def create_scheduler(training_args, optimizer):
from transformers.optimization import get_scheduler
return get_scheduler(
training_args.lr_scheduler_type,
optimizer=optimizer if optimizer is None else optimizer,
num_warmup_steps=training_args.get_warmup_steps(training_args.max_steps),
num_training_steps=training_args.max_steps,
)
def compute_metrics(eval_preds):
probs, labels = eval_preds
preds = np.argmax(probs, axis=-1)
result = {"accuracy": accuracy_score(labels, preds), "mcc": matthews_corrcoef(labels, preds)}
return result
def preprocess_logits_for_metrics(logits, labels):
return torch.softmax(logits, dim=-1)
if __name__ == "__main__":
- device = torch.device("cpu")
+ device = torch.device("hpu")
raw_dataset = load_dataset("mila-intel/ProtST-BinaryLocalization")
model = AutoModel.from_pretrained("mila-intel/protst-esm1b-for-sequential-classification", trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)
tokenizer = AutoTokenizer.from_pretrained("facebook/esm1b_t33_650M_UR50S")
output_dir = "/home/jiqingfe/protst/protst_2/ProtST-HuggingFace/output_dir/ProtSTModel/default/ESM-1b_PubMedBERT-abs/240123_015856"
training_args = {'output_dir': output_dir, 'overwrite_output_dir': True, 'do_train': True, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, \
'learning_rate': 5e-05, 'weight_decay': 0, 'num_train_epochs': 100, 'max_steps': -1, 'lr_scheduler_type': 'constant', 'do_eval': True, \
'evaluation_strategy': 'epoch', 'per_device_eval_batch_size': 32, 'logging_strategy': 'epoch', 'save_strategy': 'epoch', 'save_steps': 820, \
'dataloader_num_workers': 0, 'run_name': 'downstream_esm1b_localization_fix', 'optim': 'adamw_torch', 'resume_from_checkpoint': False, \
- 'label_names': ['labels'], 'load_best_model_at_end': True, 'metric_for_best_model': 'accuracy', 'bf16': True, "save_total_limit": 3}
+ 'label_names': ['labels'], 'load_best_model_at_end': True, 'metric_for_best_model': 'accuracy', 'bf16': True, "save_total_limit": 3, "use_habana":True, "use_lazy_mode": True, "use_hpu_graphs_for_inference": True}
- training_args = HfArgumentParser(TrainingArguments).parse_dict(training_args, allow_extra_keys=False)[0]
+ training_args = HfArgumentParser(GaudiTrainingArguments).parse_dict(training_args, allow_extra_keys=False)[0]
def tokenize_protein(example, tokenizer=None):
protein_seq = example["prot_seq"]
protein_seq_str = tokenizer(protein_seq, add_special_tokens=True)
example["input_ids"] = protein_seq_str["input_ids"]
example["attention_mask"] = protein_seq_str["attention_mask"]
example["labels"] = example["localization"]
return example
func_tokenize_protein = functools.partial(tokenize_protein, tokenizer=tokenizer)
for split in ["train", "validation", "test"]:
raw_dataset[split] = raw_dataset[split].map(func_tokenize_protein, batched=False, remove_columns=["Unnamed: 0", "prot_seq", "localization"])
- data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length", max_length=1024)
transformers.utils.logging.set_verbosity_info()
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
optimizer = create_optimizer(model)
scheduler = create_scheduler(training_args, optimizer)
+ gaudi_config = GaudiConfig()
+ gaudi_config.use_fused_adam = True
+ gaudi_config.use_fused_clip_norm =True
# build trainer
- trainer = Trainer(
+ trainer = GaudiTrainer(
model=model,
+ gaudi_config=gaudi_config,
args=training_args,
train_dataset=raw_dataset["train"],
eval_dataset=raw_dataset["validation"],
data_collator=data_collator,
optimizers=(optimizer, scheduler),
compute_metrics=compute_metrics,
preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
train_result = trainer.train()
trainer.save_model()
# Saves the tokenizer too for easy upload
tokenizer.save_pretrained(training_args.output_dir)
metrics = train_result.metrics
metrics["train_samples"] = len(raw_dataset["train"])
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
metric = trainer.evaluate(raw_dataset["test"], metric_key_prefix="test")
print("test metric: ", metric)
metric = trainer.evaluate(raw_dataset["validation"], metric_key_prefix="valid")
print("valid metric: ", metric)
```
|