smaller tokenizer and model

- config.json +1 -1
- make-tiny-deberta.py +28 -45
- merges.txt +0 -0
- pytorch_model.bin +2 -2
- tokenizer.json +0 -0
- vocab.json +0 -0
config.json
CHANGED

@@ -29,5 +29,5 @@
   "torch_dtype": "float16",
   "transformers_version": "4.9.0.dev0",
   "type_vocab_size": 0,
-  "vocab_size":
+  "vocab_size": 5001
 }
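Not part of the commit, but a quick way to sanity-check the shrunk artifacts once the repo is cloned locally; the ./tiny-deberta path is an assumption about where the clone lives.

# sketch only: load the committed config and tokenizer and print the shrunk sizes
from transformers import AutoConfig, AutoTokenizer

config = AutoConfig.from_pretrained("./tiny-deberta")            # assumed local clone
tokenizer = AutoTokenizer.from_pretrained("./tiny-deberta", use_fast=True)

print(config.vocab_size)   # 5001 per this diff
print(len(tokenizer))      # kept base vocab plus any added special tokens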
make-tiny-deberta.py
CHANGED

@@ -74,50 +74,33 @@ mname_tiny = "tiny-deberta"
 
 ### Tokenizer
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# vocab_short_path = f"{tmp_dir}/vocab.json"
-# tokenizer_fast.save_pretrained(tmp_dir)
-# # resize tokenizer.json (vocab.txt will be automatically resized on save_pretrained)
-# # perl -pi -e 's|(2999).*|$1}}}|' tokenizer.json # 0-indexed, so vocab_keep_items-1!
-# closing_pat = "}"
-# cmd = (f"perl -pi -e s|({vocab_keep_items-1}).*|$1{closing_pat}| {tmp_dir}/vocab.json").split()
-# result = subprocess.run(cmd, capture_output=True, text=True)
-# # reload with modified tokenizer
-# #tokenizer_fast_tiny = DebertaTokenizerFast.from_pretrained(tmp_dir, vocab_file=vocab_short_path)
-# # it seems that ElectraTokenizer is not needed and ElectraTokenizerFast does the job
-
-
-# using full tokenizer for now
-tokenizer_fast_tiny = DebertaTokenizerFast.from_pretrained(mname_orig)
-
-
-
+import json
+from transformers import AutoTokenizer
+from tokenizers import Tokenizer
+vocab_keep_items = 5000
+tokenizer = AutoTokenizer.from_pretrained(mname_orig, use_fast=True)
+assert tokenizer.is_fast, "This only works for fast tokenizers."
+tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
+vocab = tokenizer_json["model"]["vocab"]
+if tokenizer_json["model"]["type"] == "BPE":
+    new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
+    merges = tokenizer_json["model"]["merges"]
+    new_merges = []
+    for i in range(len(merges)):
+        a, b = merges[i].split()
+        new_token = "".join((a, b))
+        if a in new_vocab and b in new_vocab and new_token in new_vocab:
+            new_merges.append(merges[i])
+    tokenizer_json["model"]["merges"] = new_merges
+elif tokenizer_json["model"]["type"] == "Unigram":
+    new_vocab = vocab[:vocab_keep_items]
+elif tokenizer_json["model"]["type"] == "WordPiece" or tokenizer_json["model"]["type"] == "WordLevel":
+    new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
+else:
+    raise ValueError(f"don't know how to handle {tokenizer_json['model']['type']}")
+tokenizer_json["model"]["vocab"] = new_vocab
+tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
+tokenizer_fast_tiny = tokenizer
 
 
 ### Config

@@ -126,7 +109,7 @@ config_tiny = DebertaConfig.from_pretrained(mname_orig)
 print(config_tiny)
 # remember to update this to the actual config as each model is different and then shrink the numbers
 config_tiny.update(dict(
-
+    vocab_size=vocab_keep_items,
     embedding_size=32,
     pooler_size=32,
     hidden_size=32,
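The new code keeps only vocab entries with ids below vocab_keep_items and drops every merge rule whose parts or result fall outside that range, so the pruned BPE stays self-consistent. A small sketch of how the in-memory result could be exercised right after this block runs; the sample sentence is arbitrary and not part of the script.

# sketch only: for plain text every id emitted by the shrunk tokenizer should
# fall inside the kept range (byte-level fallback tokens all have low ids)
ids = tokenizer_fast_tiny("The quick brown fox jumps over the lazy dog")["input_ids"]
print(ids)
assert all(i < vocab_keep_items for i in ids), "id outside the kept range"
print(tokenizer_fast_tiny.decode(ids))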
merges.txt
CHANGED

The diff for this file is too large to render. See raw diff.
pytorch_model.bin
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5f071626d5c3781b98f722d52a8e7f1ae7e0df341123f3e764fdf4798d8ca59f
+size 408039
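pytorch_model.bin is stored via Git LFS, so the commit only updates the pointer file: oid is the SHA-256 of the new weight blob and size its byte count. Not part of the commit, but a sketch of checking a downloaded checkpoint against this pointer; the local path is an assumption.

# sketch only: verify a local pytorch_model.bin against the LFS pointer above
import hashlib, os

path = "./tiny-deberta/pytorch_model.bin"                        # assumed local clone
assert os.path.getsize(path) == 408039

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == "5f071626d5c3781b98f722d52a8e7f1ae7e0df341123f3e764fdf4798d8ca59f"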
tokenizer.json
CHANGED

The diff for this file is too large to render. See raw diff.
vocab.json
CHANGED

The diff for this file is too large to render. See raw diff.