Wietse de Vries
committed on
Commit
·
ff8ab2f
1
Parent(s):
9dbd6c0
add missing char tokens to vocab (with embeddings close to [UNK])
Browse files
- config.json +7 -2
- pytorch_model.bin +2 -2
- tf_model.h5 +2 -2
- tokenizer.json +0 -0
- tokenizer_config.json +9 -2
- vocab.txt +74 -1
config.json
CHANGED
@@ -1,8 +1,10 @@
|
|
1 |
{
|
|
|
2 |
"architectures": [
|
3 |
"BertForMaskedLM"
|
4 |
],
|
5 |
"attention_probs_dropout_prob": 0.1,
|
|
|
6 |
"hidden_act": "gelu",
|
7 |
"hidden_dropout_prob": 0.1,
|
8 |
"hidden_size": 768,
|
@@ -14,6 +16,9 @@
|
|
14 |
"num_attention_heads": 12,
|
15 |
"num_hidden_layers": 12,
|
16 |
"pad_token_id": 3,
|
|
|
|
|
17 |
"type_vocab_size": 2,
|
18 |
-
"
|
19 |
-
|
|
|
|
1 |
{
|
2 |
+
"_name_or_path": "bert-base-dutch-cased",
|
3 |
"architectures": [
|
4 |
"BertForMaskedLM"
|
5 |
],
|
6 |
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"gradient_checkpointing": false,
|
8 |
"hidden_act": "gelu",
|
9 |
"hidden_dropout_prob": 0.1,
|
10 |
"hidden_size": 768,
|
|
|
16 |
"num_attention_heads": 12,
|
17 |
"num_hidden_layers": 12,
|
18 |
"pad_token_id": 3,
|
19 |
+
"position_embedding_type": "absolute",
|
20 |
+
"transformers_version": "4.5.1",
|
21 |
"type_vocab_size": 2,
|
22 |
+
"use_cache": true,
|
23 |
+
"vocab_size": 30073
|
24 |
+
}
|
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5ffe408c7eea0ffee4c257c6028f8c98146967e3ac3db51dba8e2bc8a4abddf5
|
3 |
+
size 436761702
|
tf_model.h5
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:88cc47b929d21ed816d6ad8d5abea5c06ccae04a5f04f2d6b07da7d212aa18e1
|
3 |
+
size 530923844
|
tokenizer.json
DELETED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
CHANGED
@@ -1,4 +1,11 @@
|
|
1 |
{
|
2 |
"do_lower_case": false,
|
3 |
-
"
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
{
|
2 |
"do_lower_case": false,
|
3 |
+
"unk_token": "[UNK]",
|
4 |
+
"sep_token": "[SEP]",
|
5 |
+
"pad_token": "[PAD]",
|
6 |
+
"cls_token": "[CLS]",
|
7 |
+
"mask_token": "[MASK]",
|
8 |
+
"tokenize_chinese_chars": true,
|
9 |
+
"strip_accents": null,
|
10 |
+
"model_max_length": 512
|
11 |
+
}
|
vocab.txt
CHANGED
@@ -29997,4 +29997,77 @@ zóó
|
|
29997 |
##óók
|
29998 |
##öl
|
29999 |
##ön
|
30000 |
-
##ör
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29997 |
##óók
|
29998 |
##öl
|
29999 |
##ön
|
30000 |
+
##ör
|
30001 |
+
##Q
|
30002 |
+
##X
|
30003 |
+
##Ç
|
30004 |
+
##Ó
|
30005 |
+
##Ô
|
30006 |
+
##Ú
|
30007 |
+
##Û
|
30008 |
+
##Ü
|
30009 |
+
##à
|
30010 |
+
##á
|
30011 |
+
##â
|
30012 |
+
##ä
|
30013 |
+
##ê
|
30014 |
+
##ì
|
30015 |
+
##í
|
30016 |
+
##î
|
30017 |
+
##ñ
|
30018 |
+
##ò
|
30019 |
+
##ô
|
30020 |
+
##ù
|
30021 |
+
##ú
|
30022 |
+
##û
|
30023 |
+
##ü
|
30024 |
+
Q
|
30025 |
+
X
|
30026 |
+
a
|
30027 |
+
c
|
30028 |
+
e
|
30029 |
+
f
|
30030 |
+
g
|
30031 |
+
h
|
30032 |
+
i
|
30033 |
+
j
|
30034 |
+
k
|
30035 |
+
l
|
30036 |
+
m
|
30037 |
+
n
|
30038 |
+
o
|
30039 |
+
p
|
30040 |
+
q
|
30041 |
+
r
|
30042 |
+
s
|
30043 |
+
t
|
30044 |
+
u
|
30045 |
+
x
|
30046 |
+
y
|
30047 |
+
Ç
|
30048 |
+
Ó
|
30049 |
+
Ô
|
30050 |
+
Ú
|
30051 |
+
Û
|
30052 |
+
Ü
|
30053 |
+
à
|
30054 |
+
á
|
30055 |
+
â
|
30056 |
+
ä
|
30057 |
+
è
|
30058 |
+
é
|
30059 |
+
ê
|
30060 |
+
ë
|
30061 |
+
ì
|
30062 |
+
í
|
30063 |
+
î
|
30064 |
+
ï
|
30065 |
+
ñ
|
30066 |
+
ò
|
30067 |
+
ó
|
30068 |
+
ô
|
30069 |
+
ö
|
30070 |
+
ù
|
30071 |
+
ú
|
30072 |
+
û
|
30073 |
+
ü
|