File size: 8,432 Bytes
c447b83
 
 
 
 
 
 
2eac302
 
1151de3
 
 
 
2eac302
455513d
 
2eac302
455513d
f0b2e2a
455513d
 
 
 
 
 
 
 
 
 
 
2eac302
 
 
455513d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2eac302
 
455513d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b733fa5
a888915
455513d
 
 
 
 
 
2eac302
 
 
 
455513d
 
 
 
 
 
 
2eac302
 
 
 
455513d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2eac302
 
455513d
 
 
 
 
 
 
 
2eac302
 
 
 
 
455513d
2eac302
 
455513d
2eac302
455513d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
---
library_name: transformers
tags: []
---

# Model Card for Model ID

ProtST for binary localization.

## Performance
ProtST-ESM-1b outperforms the other models in most cases (see the comparison below).
![image/png](https://cdn-uploads.huggingface.co/production/uploads/62f0a673f0d40f6aae296b4a/Zetv-yXrJw5V2eceVBVx7.png)

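The following script shows how to fine-tune ProtST on Gaudi. It is shown as a diff against the CPU version: lines prefixed with `-` are removed and lines prefixed with `+` are added when moving to Gaudi.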

## Running script
```diff
from transformers import AutoModel, AutoTokenizer, HfArgumentParser, TrainingArguments, Trainer
from transformers.data.data_collator import DataCollatorWithPadding
from transformers.trainer_pt_utils import get_parameter_names
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from datasets import load_dataset
import functools
import numpy as np
from sklearn.metrics import accuracy_score, matthews_corrcoef
import sys
import torch
import logging
import datasets
import transformers
+ import habana_frameworks.torch
+ from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def create_optimizer(opt_model, lr_ratio=0.1):
    # Build an optimizer over four parameter groups: head/backbone x decay/no-decay.
    #
    # opt_model: model exposing ``named_parameters()``; parameters whose name
    #            contains "classifier" are treated as the head.
    # lr_ratio:  multiplier applied to the learning rate of the backbone groups.
    #
    # NOTE: this function reads the module-level ``training_args`` (assigned in
    # the ``__main__`` section) for ``weight_decay`` and ``learning_rate``, so it
    # must only be called after that variable exists.
    # Side effect: every parameter outside the head gets ``requires_grad = False``.
    head_names = []
    for n, p in opt_model.named_parameters():
        if "classifier" in n:
            head_names.append(n)
        else:
            # freeze everything that is not part of the classifier head
            p.requires_grad = False
    # sanity check: every head parameter must still be trainable
    for n, p in opt_model.named_parameters():
        if n in head_names:
            assert p.requires_grad
    # Collect trainable non-head parameters. Because the first loop froze every
    # non-head parameter, nothing passes this filter and the list stays empty,
    # which leaves both backbone groups below without parameters.
    backbone_names = []
    for n, p in opt_model.named_parameters():
        if n not in head_names and p.requires_grad:
            backbone_names.append(n)
    # for weight_decay policy, see
    # https://github.com/huggingface/transformers/blob/50573c648ae953dcc1b94d663651f07fb02268f4/src/transformers/trainer.py#L947
    decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS) # forbidden layer norm
    # bias terms are excluded from weight decay as well
    decay_parameters = [name for name in decay_parameters if "bias" not in name]
    # head groups use training_args.learning_rate
    head_decay_parameters = [name for name in head_names if name in decay_parameters]
    head_not_decay_parameters = [name for name in head_names if name not in decay_parameters]
    # backbone groups use training_args.learning_rate * lr_ratio
    backbone_decay_parameters = [name for name in backbone_names if name in decay_parameters]
    backbone_not_decay_parameters = [name for name in backbone_names if name not in decay_parameters]
    # Group order: head+decay, backbone+decay, head+no-decay, backbone+no-decay.
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in opt_model.named_parameters() if (n in head_decay_parameters and p.requires_grad)],
            "weight_decay": training_args.weight_decay,
            "lr": training_args.learning_rate
        },
        {
            "params": [p for n, p in opt_model.named_parameters() if (n in backbone_decay_parameters and p.requires_grad)],
            "weight_decay": training_args.weight_decay,
            "lr": training_args.learning_rate * lr_ratio
        },
        {
            "params": [p for n, p in opt_model.named_parameters() if (n in head_not_decay_parameters and p.requires_grad)],
            "weight_decay": 0.0,
            "lr": training_args.learning_rate
        },
        {
            "params": [p for n, p in opt_model.named_parameters() if (n in backbone_not_decay_parameters and p.requires_grad)],
            "weight_decay": 0.0,
            "lr": training_args.learning_rate * lr_ratio
        },
    ]
    # Diff markers: the "-" line is the CPU version, the "+" line is its Gaudi
    # replacement; keep only one of the two when running the script.
-   optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
+   optimizer_cls, optimizer_kwargs = GaudiTrainer.get_optimizer_cls_and_kwargs(training_args)
    optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)

    return optimizer

def create_scheduler(training_args, optimizer):
    """Build the learning-rate scheduler selected by ``training_args``.

    Args:
        training_args: Object providing ``lr_scheduler_type``, ``max_steps``
            and ``get_warmup_steps`` (the script passes its parsed training
            arguments).
        optimizer: Optimizer instance the scheduler is attached to.

    Returns:
        The scheduler produced by ``transformers.optimization.get_scheduler``.
    """
    from transformers.optimization import get_scheduler

    # The step budget is taken from ``max_steps`` unchanged. The script in this
    # file sets ``max_steps`` to -1 with a 'constant' schedule, so callers that
    # pick a step-dependent schedule need a real ``max_steps`` value.
    total_steps = training_args.max_steps
    return get_scheduler(
        training_args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=training_args.get_warmup_steps(total_steps),
        num_training_steps=total_steps,
    )

def compute_metrics(eval_preds):
    """Score predictions with accuracy and Matthews correlation coefficient.

    Args:
        eval_preds: Pair ``(probs, labels)``. ``probs`` is reduced with an
            arg-max over its last axis to obtain the predicted class ids.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"mcc"``.
    """
    scores, references = eval_preds
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(references, predicted),
        "mcc": matthews_corrcoef(references, predicted),
    }

def preprocess_logits_for_metrics(logits, labels):
    """Convert logits into probabilities with a softmax over the last axis.

    Args:
        logits: Tensor of raw scores.
        labels: Accepted for signature compatibility; not used.

    Returns:
        Tensor of the same shape as ``logits`` whose last axis sums to 1.
    """
    probabilities = torch.softmax(logits, dim=-1)
    return probabilities


if __name__ == "__main__":
    # Script entry point: load data/model, tokenize, train, then evaluate.
    # Lines beginning with "-" / "+" are diff markers (CPU version removed /
    # Gaudi version added); keep only one side of each pair when running.
-   device = torch.device("cpu")
+   device = torch.device("hpu")
    raw_dataset = load_dataset("mila-intel/ProtST-BinaryLocalization")
    # The model is loaded in bfloat16 with trust_remote_code=True and moved to `device`.
    model = AutoModel.from_pretrained("mila-intel/protst-esm1b-for-sequential-classification", trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)
    tokenizer = AutoTokenizer.from_pretrained("facebook/esm1b_t33_650M_UR50S")

    # NOTE(review): hard-coded absolute path from the author's machine; point
    # this at a writable directory of your own before running.
    output_dir = "/home/jiqingfe/protst/protst_2/ProtST-HuggingFace/output_dir/ProtSTModel/default/ESM-1b_PubMedBERT-abs/240123_015856"
    # Training configuration as a plain dict, parsed into an arguments object
    # below. The "+" variant adds the keys use_habana, use_lazy_mode and
    # use_hpu_graphs_for_inference. 'max_steps' is -1 here, and that value is
    # what create_scheduler forwards as the number of training steps;
    # presumably harmless with the 'constant' schedule -- confirm before
    # switching lr_scheduler_type.
    training_args = {'output_dir': output_dir, 'overwrite_output_dir': True, 'do_train': True, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, \
                     'learning_rate': 5e-05, 'weight_decay': 0, 'num_train_epochs': 100, 'max_steps': -1, 'lr_scheduler_type': 'constant', 'do_eval': True, \
                     'evaluation_strategy': 'epoch', 'per_device_eval_batch_size': 32, 'logging_strategy': 'epoch', 'save_strategy': 'epoch', 'save_steps': 820, \
                     'dataloader_num_workers': 0, 'run_name': 'downstream_esm1b_localization_fix', 'optim': 'adamw_torch', 'resume_from_checkpoint': False, \
-                    'label_names': ['labels'], 'load_best_model_at_end': True, 'metric_for_best_model': 'accuracy', 'bf16': True, "save_total_limit": 3}
+                    'label_names': ['labels'], 'load_best_model_at_end': True, 'metric_for_best_model': 'accuracy', 'bf16': True, "save_total_limit": 3, "use_habana":True, "use_lazy_mode": True, "use_hpu_graphs_for_inference": True}
-   training_args = HfArgumentParser(TrainingArguments).parse_dict(training_args, allow_extra_keys=False)[0]
+   training_args = HfArgumentParser(GaudiTrainingArguments).parse_dict(training_args, allow_extra_keys=False)[0]

    def tokenize_protein(example, tokenizer=None):
        # Tokenize one dataset row (map is called with batched=False below):
        # the "prot_seq" field is tokenized and "localization" becomes "labels".
        protein_seq = example["prot_seq"]
        protein_seq_str = tokenizer(protein_seq, add_special_tokens=True)
        example["input_ids"] = protein_seq_str["input_ids"]
        example["attention_mask"] = protein_seq_str["attention_mask"]
        example["labels"] = example["localization"]

        return example

    # Bind the tokenizer so the mapped function only takes the example.
    func_tokenize_protein = functools.partial(tokenize_protein, tokenizer=tokenizer)

    # Tokenize every split and drop the raw columns that are no longer needed.
    for split in ["train", "validation", "test"]:
        raw_dataset[split] = raw_dataset[split].map(func_tokenize_protein, batched=False, remove_columns=["Unnamed: 0", "prot_seq", "localization"])

    # The "+" variant pads every batch to a fixed length of 1024 instead of the
    # collator's default padding (presumably to keep tensor shapes static on
    # HPU -- confirm).
-   data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+   data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length", max_length=1024)

    transformers.utils.logging.set_verbosity_info()
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)

    # create_optimizer reads the module-level `training_args`, so it has to be
    # called after the arguments were parsed above.
    optimizer = create_optimizer(model)
    scheduler = create_scheduler(training_args, optimizer)

    # Gaudi-only: configuration object with fused Adam and fused clip-norm switched on.
+   gaudi_config = GaudiConfig()
+   gaudi_config.use_fused_adam = True
+   gaudi_config.use_fused_clip_norm =True


    # build trainer
-   trainer = Trainer(
+   trainer = GaudiTrainer(
        model=model,
+       gaudi_config=gaudi_config,
        args=training_args,
        train_dataset=raw_dataset["train"],
        eval_dataset=raw_dataset["validation"],
        data_collator=data_collator,
        optimizers=(optimizer, scheduler),
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

    train_result = trainer.train()

    trainer.save_model()
    # Saves the tokenizer too for easy upload
    tokenizer.save_pretrained(training_args.output_dir)

    # Record the training-set size alongside the metrics returned by train().
    metrics = train_result.metrics
    metrics["train_samples"] = len(raw_dataset["train"])

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Final evaluation on the held-out test split, then on the validation split.
    metric = trainer.evaluate(raw_dataset["test"], metric_key_prefix="test")
    print("test metric: ", metric)

    metric = trainer.evaluate(raw_dataset["validation"], metric_key_prefix="valid")
    print("valid metric: ", metric)
```