Initial model
- config.json +24 -0
- params.json +16 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- training.log +25 -0
- training_args.bin +3 -0
- vocab.txt +26 -0
config.json
ADDED
@@ -0,0 +1,24 @@
+{
+  "_name_or_path": "bert_reduced_intermediate_pe_50_epochs_256_bs_5e-05_lr_0.0_warmup_VDJdb_PIRD/checkpoint-16608",
+  "architectures": [
+    "BertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 64,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 21,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.4.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 26
+}
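This config describes a reduced BERT encoder with a masked-LM head: 12 layers, hidden size 768, intermediate size 1536 (half of BERT-base's 3072), at most 64 positions, and a 26-token vocabulary, consistent with the "reduced_intermediate_pe" naming. A minimal loading sketch with the transformers library, assuming a local clone of this repository at a placeholder path:

```python
# Minimal sketch: the local path is a placeholder, not part of this repository.
from transformers import BertConfig, BertForMaskedLM

repo_dir = "path/to/local/clone"                   # hypothetical location of this repo
config = BertConfig.from_pretrained(repo_dir)      # parses config.json above
model = BertForMaskedLM.from_pretrained(repo_dir)  # loads pytorch_model.bin below
print(config.hidden_size, config.intermediate_size, config.vocab_size)  # 768 1536 26
```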
params.json
ADDED
@@ -0,0 +1,16 @@
+{
+  "datasets": [
+    "VDJdb",
+    "PIRD"
+  ],
+  "bert": "bert",
+  "config": "/home/groups/jamesz/wukevin/projects/tcr/model_configs/bert_reduced_intermediate_pe.json",
+  "outdir": "bert_reduced_intermediate_pe_50_epochs_256_bs_5e-05_lr_0.0_warmup_VDJdb_PIRD",
+  "epochs": 50,
+  "bs": 256,
+  "lr": 5e-05,
+  "warmup": 0.0,
+  "cpu": false,
+  "holdout": 0.1,
+  "noneptune": false
+}
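These are the parameters the training run was invoked with; they are echoed again at the top of training.log below. A small sketch of reading them back, assuming a local copy of the file:

```python
# Sketch: reading the recorded training hyperparameters from params.json.
import json

with open("params.json") as fh:
    params = json.load(fh)

# The outdir name concatenates epochs, batch size, lr, warmup and dataset names.
print(params["epochs"], params["bs"], params["lr"], params["warmup"], params["datasets"])
# 50 256 5e-05 0.0 ['VDJdb', 'PIRD']
```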
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e9bad29334de6caeccfa1b3295598c5612fa5fe6ad343f5c08ebca382f9bf85
+size 229641810
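This is a Git LFS pointer rather than the weights themselves; the roughly 230 MB checkpoint is fetched by Git LFS. Since the LFS oid is the SHA-256 of the file contents, a fetched copy can be checked against the pointer with a sketch like this:

```python
# Sketch: verify a downloaded pytorch_model.bin against the LFS pointer above.
import hashlib
import os

path = "pytorch_model.bin"
digest = hashlib.sha256(open(path, "rb").read()).hexdigest()
assert os.path.getsize(path) == 229641810
assert digest == "2e9bad29334de6caeccfa1b3295598c5612fa5fe6ad343f5c08ebca382f9bf85"
```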
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
+{"unk_token": "?", "sep_token": "|", "pad_token": "$", "cls_token": "*", "mask_token": "."}
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
+{"do_lower_case": false, "do_basic_tokenize": true, "never_split": null, "unk_token": "?", "sep_token": "|", "pad_token": "$", "cls_token": "*", "mask_token": ".", "tokenize_chinese_chars": false, "strip_accents": null, "model_max_len": 45, "padding_side": "right"}
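Together with special_tokens_map.json above and vocab.txt below, this defines a per-residue amino-acid tokenizer ("?" unknown, "|" separator, "$" padding, "*" classifier, "." mask). A sketch of rebuilding it with transformers' BertTokenizer; feeding the sequence as space-separated characters is an assumption here, but it reproduces the example ids shown in training.log below:

```python
# Sketch: rebuilding the amino-acid tokenizer from vocab.txt and the settings above.
from transformers import BertTokenizer

tok = BertTokenizer(
    vocab_file="vocab.txt",
    do_lower_case=False,
    unk_token="?",
    sep_token="|",
    pad_token="$",
    cls_token="*",
    mask_token=".",
    tokenize_chinese_chars=False,
)

# One token per residue; separating characters with spaces is assumed.
ids = tok.encode(" ".join("CASSQDRGPANEQFF"))
print(ids)  # [25, 9, 13, 5, 5, 8, 3, 0, 11, 12, 13, 7, 4, 8, 18, 18, 24], as in training.log
```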
training.log
ADDED
@@ -0,0 +1,25 @@
+Git commit: 53803a3acd9c7e1115233fff458d2226d7fd0c87
+PyTorch CUDA version: 10.2
+Parameter datasets: ['VDJdb', 'PIRD']
+Parameter bert: bert
+Parameter config: /home/groups/jamesz/wukevin/projects/tcr/model_configs/bert_reduced_intermediate_pe.json
+Parameter outdir: bert_reduced_intermediate_pe_50_epochs_256_bs_5e-05_lr_0.0_warmup_VDJdb_PIRD
+Parameter epochs: 50
+Parameter bs: 256
+Parameter lr: 5e-05
+Parameter warmup: 0.0
+Parameter cpu: False
+Parameter holdout: 0.1
+Parameter noneptune: False
+Filtering VDJdb species to: ['MusMusculus', 'HomoSapiens']
+VDJdb: dropping 0 entries for null cdr3 sequence
+VDJdb: dropping 0 entries for unrecognized AAs
+PIRD data TRA/TRB instances: Counter({'TRB': 46483, 'TRA': 4019, 'TRA-TRB': 637})
+PIRD data 0.1655 data labelled with antigen sequence
+PIRD: Removing 95 entires with non amino acid residues
+Creating self supervised dataset with 98225 sequences
+Maximum sequence length: 45
+Example of tokenized input: CASSQDRGPANEQFF -> [25, 9, 13, 5, 5, 8, 3, 0, 11, 12, 13, 7, 4, 8, 18, 18, 24]
+Split test with 9822 examples
+Split train with 88403 examples
+Loading vanilla BERT model
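The log shows the model was trained as a plain masked language model over CDR3 amino-acid sequences from VDJdb and PIRD. As a rough illustration (not taken from the training code), one could mask a residue and ask the loaded model for its prediction, reusing the `tok` and `model` objects from the sketches above:

```python
# Sketch: predicting a masked residue with the masked-LM head.
import torch

residues = list("CASSQDRGPANEQFF")
residues[5] = tok.mask_token              # replace one residue with the mask token "."
inputs = tok(" ".join(residues), return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

masked_pos = (inputs["input_ids"][0] == tok.mask_token_id).nonzero().item()
predicted = tok.convert_ids_to_tokens(int(logits[0, masked_pos].argmax()))
print(predicted)  # the residue the model scores highest at the masked position
```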
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb7d0ac4041be09c51113d2dfa606d309921c85720720f22d82983e9a11f06ea
+size 2415
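Another LFS pointer; the underlying file is the pickled TrainingArguments object that the transformers Trainer saves alongside a checkpoint. A sketch of inspecting it (transformers must be importable to unpickle it, and the exact torch.load arguments depend on your PyTorch version):

```python
# Sketch: inspecting the saved TrainingArguments (pickled object, not a tensor file).
import torch

# On recent PyTorch versions weights_only=False is needed for pickled objects.
args = torch.load("training_args.bin", weights_only=False)
print(args.num_train_epochs, args.per_device_train_batch_size, args.learning_rate)
```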
vocab.txt
ADDED
@@ -0,0 +1,26 @@
+R
+H
+K
+D
+E
+S
+T
+N
+Q
+C
+U
+G
+P
+A
+V
+I
+L
+M
+F
+Y
+W
+$
+.
+?
+|
+*
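Line order defines the token ids: the 20 standard amino acids plus U occupy ids 0-20, followed by the special tokens at ids 21-25. A small sketch tying this back to the other files:

```python
# Sketch: token ids are assigned by line order in vocab.txt.
with open("vocab.txt") as fh:
    vocab = {token: idx for idx, token in enumerate(line.strip() for line in fh)}

print(vocab["$"])              # 21 -> matches pad_token_id in config.json
print(vocab["*"], vocab["|"])  # 25 and 24 -> the leading/trailing ids in the training.log example
print(vocab["."])              # 22 -> the mask token id
```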