wukevin commited on
Commit
ac9d9f1
·
1 Parent(s): ea18e16

Initial model

Browse files
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bert_reduced_intermediate_pe_50_epochs_256_bs_5e-05_lr_0.0_warmup_VDJdb_PIRD/checkpoint-16608",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 1536,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 64,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "pad_token_id": 21,
19
+ "position_embedding_type": "absolute",
20
+ "transformers_version": "4.4.2",
21
+ "type_vocab_size": 2,
22
+ "use_cache": true,
23
+ "vocab_size": 26
24
+ }
params.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "datasets": [
3
+ "VDJdb",
4
+ "PIRD"
5
+ ],
6
+ "bert": "bert",
7
+ "config": "/home/groups/jamesz/wukevin/projects/tcr/model_configs/bert_reduced_intermediate_pe.json",
8
+ "outdir": "bert_reduced_intermediate_pe_50_epochs_256_bs_5e-05_lr_0.0_warmup_VDJdb_PIRD",
9
+ "epochs": 50,
10
+ "bs": 256,
11
+ "lr": 5e-05,
12
+ "warmup": 0.0,
13
+ "cpu": false,
14
+ "holdout": 0.1,
15
+ "noneptune": false
16
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e9bad29334de6caeccfa1b3295598c5612fa5fe6ad343f5c08ebca382f9bf85
3
+ size 229641810
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "?", "sep_token": "|", "pad_token": "$", "cls_token": "*", "mask_token": "."}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": false, "do_basic_tokenize": true, "never_split": null, "unk_token": "?", "sep_token": "|", "pad_token": "$", "cls_token": "*", "mask_token": ".", "tokenize_chinese_chars": false, "strip_accents": null, "model_max_len": 45, "padding_side": "right"}
training.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Git commit: 53803a3acd9c7e1115233fff458d2226d7fd0c87
2
+ PyTorch CUDA version: 10.2
3
+ Parameter datasets: ['VDJdb', 'PIRD']
4
+ Parameter bert: bert
5
+ Parameter config: /home/groups/jamesz/wukevin/projects/tcr/model_configs/bert_reduced_intermediate_pe.json
6
+ Parameter outdir: bert_reduced_intermediate_pe_50_epochs_256_bs_5e-05_lr_0.0_warmup_VDJdb_PIRD
7
+ Parameter epochs: 50
8
+ Parameter bs: 256
9
+ Parameter lr: 5e-05
10
+ Parameter warmup: 0.0
11
+ Parameter cpu: False
12
+ Parameter holdout: 0.1
13
+ Parameter noneptune: False
14
+ Filtering VDJdb species to: ['MusMusculus', 'HomoSapiens']
15
+ VDJdb: dropping 0 entries for null cdr3 sequence
16
+ VDJdb: dropping 0 entries for unrecognized AAs
17
+ PIRD data TRA/TRB instances: Counter({'TRB': 46483, 'TRA': 4019, 'TRA-TRB': 637})
18
+ PIRD data 0.1655 data labelled with antigen sequence
19
+ PIRD: Removing 95 entries with non amino acid residues
20
+ Creating self supervised dataset with 98225 sequences
21
+ Maximum sequence length: 45
22
+ Example of tokenized input: CASSQDRGPANEQFF -> [25, 9, 13, 5, 5, 8, 3, 0, 11, 12, 13, 7, 4, 8, 18, 18, 24]
23
+ Split test with 9822 examples
24
+ Split train with 88403 examples
25
+ Loading vanilla BERT model
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb7d0ac4041be09c51113d2dfa606d309921c85720720f22d82983e9a11f06ea
3
+ size 2415
vocab.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ R
2
+ H
3
+ K
4
+ D
5
+ E
6
+ S
7
+ T
8
+ N
9
+ Q
10
+ C
11
+ U
12
+ G
13
+ P
14
+ A
15
+ V
16
+ I
17
+ L
18
+ M
19
+ F
20
+ Y
21
+ W
22
+ $
23
+ .
24
+ ?
25
+ |
26
+ *