KrisMinchev committed on
Commit
fa18a9a
·
verified ·
1 Parent(s): 4a19cac

End of training

Browse files
README.md CHANGED
@@ -1,5 +1,4 @@
1
  ---
2
- library_name: transformers
3
  license: gemma
4
  base_model: google/gemma-2-2b
5
  tags:
@@ -18,8 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [google/gemma-2-2b](https://huggingface.co/google/gemma-2-2b) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 1.0598
22
- - Num Input Tokens Seen: 5753060
23
 
24
  ## Model description
25
 
@@ -44,21 +43,40 @@ The following hyperparameters were used during training:
44
  - seed: 0
45
  - gradient_accumulation_steps: 16
46
  - total_train_batch_size: 128
47
- - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
48
  - lr_scheduler_type: constant_with_warmup
49
  - lr_scheduler_warmup_ratio: 0.05
50
  - num_epochs: 1
51
 
52
  ### Training results
53
 
54
- | Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen |
55
- |:-------------:|:-----:|:----:|:---------------:|:-----------------:|
56
- | No log | 0 | 0 | 1.3864 | 0 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
 
59
  ### Framework versions
60
 
61
- - Transformers 4.57.3
62
- - Pytorch 2.8.0+cu128
63
  - Datasets 2.20.0
64
- - Tokenizers 0.22.2
 
1
  ---
 
2
  license: gemma
3
  base_model: google/gemma-2-2b
4
  tags:
 
17
 
18
  This model is a fine-tuned version of [google/gemma-2-2b](https://huggingface.co/google/gemma-2-2b) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 1.0637
21
+ - Num Input Tokens Seen: 5698680
22
 
23
  ## Model description
24
 
 
43
  - seed: 0
44
  - gradient_accumulation_steps: 16
45
  - total_train_batch_size: 128
46
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: constant_with_warmup
48
  - lr_scheduler_warmup_ratio: 0.05
49
  - num_epochs: 1
50
 
51
  ### Training results
52
 
53
+ | Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen |
54
+ |:-------------:|:------:|:----:|:---------------:|:-----------------:|
55
+ | No log | 0 | 0 | 1.3911 | 0 |
56
+ | 1.3438 | 0.0511 | 5 | 1.2592 | 296352 |
57
+ | 1.1848 | 0.1021 | 10 | 1.1700 | 589152 |
58
+ | 1.1275 | 0.1532 | 15 | 1.1329 | 884504 |
59
+ | 1.0731 | 0.2042 | 20 | 1.1072 | 1182424 |
60
+ | 1.0942 | 0.2553 | 25 | 1.0975 | 1474984 |
61
+ | 1.0931 | 0.3063 | 30 | 1.0918 | 1772592 |
62
+ | 1.1141 | 0.3574 | 35 | 1.0878 | 2061504 |
63
+ | 1.0847 | 0.4084 | 40 | 1.0843 | 2358064 |
64
+ | 1.1003 | 0.4595 | 45 | 1.0811 | 2650896 |
65
+ | 1.0771 | 0.5105 | 50 | 1.0790 | 2942864 |
66
+ | 1.1246 | 0.5616 | 55 | 1.0765 | 3234512 |
67
+ | 1.1009 | 0.6126 | 60 | 1.0744 | 3525376 |
68
+ | 1.0904 | 0.6637 | 65 | 1.0727 | 3820376 |
69
+ | 1.1707 | 0.7147 | 70 | 1.0711 | 4108240 |
70
+ | 1.0279 | 0.7658 | 75 | 1.0692 | 4402208 |
71
+ | 1.1465 | 0.8168 | 80 | 1.0680 | 4698016 |
72
+ | 1.0785 | 0.8679 | 85 | 1.0669 | 4991408 |
73
+ | 1.005 | 0.9190 | 90 | 1.0651 | 5285784 |
74
+ | 1.0613 | 0.9700 | 95 | 1.0641 | 5580576 |
75
 
76
 
77
  ### Framework versions
78
 
79
+ - Transformers 4.44.0
80
+ - Pytorch 2.4.0+cu121
81
  - Datasets 2.20.0
82
+ - Tokenizers 0.19.1
config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "architectures": [
3
  "Gemma2ForCausalLM"
4
  ],
@@ -7,7 +8,6 @@
7
  "attn_logit_softcapping": 50.0,
8
  "bos_token_id": 2,
9
  "cache_implementation": "hybrid",
10
- "dtype": "bfloat16",
11
  "eos_token_id": 1,
12
  "final_logit_softcapping": 30.0,
13
  "head_dim": 256,
@@ -16,34 +16,6 @@
16
  "hidden_size": 2304,
17
  "initializer_range": 0.02,
18
  "intermediate_size": 9216,
19
- "layer_types": [
20
- "sliding_attention",
21
- "full_attention",
22
- "sliding_attention",
23
- "full_attention",
24
- "sliding_attention",
25
- "full_attention",
26
- "sliding_attention",
27
- "full_attention",
28
- "sliding_attention",
29
- "full_attention",
30
- "sliding_attention",
31
- "full_attention",
32
- "sliding_attention",
33
- "full_attention",
34
- "sliding_attention",
35
- "full_attention",
36
- "sliding_attention",
37
- "full_attention",
38
- "sliding_attention",
39
- "full_attention",
40
- "sliding_attention",
41
- "full_attention",
42
- "sliding_attention",
43
- "full_attention",
44
- "sliding_attention",
45
- "full_attention"
46
- ],
47
  "max_position_embeddings": 8192,
48
  "model_type": "gemma2",
49
  "num_attention_heads": 8,
@@ -54,7 +26,8 @@
54
  "rms_norm_eps": 1e-06,
55
  "rope_theta": 10000.0,
56
  "sliding_window": 4096,
57
- "transformers_version": "4.57.3",
 
58
  "use_cache": true,
59
  "vocab_size": 256000
60
  }
 
1
  {
2
+ "_name_or_path": "google/gemma-2-2b",
3
  "architectures": [
4
  "Gemma2ForCausalLM"
5
  ],
 
8
  "attn_logit_softcapping": 50.0,
9
  "bos_token_id": 2,
10
  "cache_implementation": "hybrid",
 
11
  "eos_token_id": 1,
12
  "final_logit_softcapping": 30.0,
13
  "head_dim": 256,
 
16
  "hidden_size": 2304,
17
  "initializer_range": 0.02,
18
  "intermediate_size": 9216,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  "max_position_embeddings": 8192,
20
  "model_type": "gemma2",
21
  "num_attention_heads": 8,
 
26
  "rms_norm_eps": 1e-06,
27
  "rope_theta": 10000.0,
28
  "sliding_window": 4096,
29
+ "torch_dtype": "bfloat16",
30
+ "transformers_version": "4.44.0",
31
  "use_cache": true,
32
  "vocab_size": 256000
33
  }
generation_config.json CHANGED
@@ -2,9 +2,7 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 2,
4
  "cache_implementation": "hybrid",
5
- "eos_token_id": [
6
- 1
7
- ],
8
  "pad_token_id": 0,
9
- "transformers_version": "4.57.3"
10
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 2,
4
  "cache_implementation": "hybrid",
5
+ "eos_token_id": 1,
 
 
6
  "pad_token_id": 0,
7
+ "transformers_version": "4.44.0"
8
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8061196e584bf69703b5decf8949f78bddc8f82112f8b8410495f00e100424b8
3
  size 4988025760
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e98bc31b5d2e3707be7c513c25f1107f2f248630f8e02bd6bde9a9629e970129
3
  size 4988025760
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec7c9816ef6b48494d3b92814b3a428b6e3bae63531a4f2e3f17dda68b0acd11
3
  size 240691728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:affaf9b17db5dc8a6fc5e35b03a6180fccb7711093cf26d7b437f57402997a0d
3
  size 240691728
model.safetensors.index.json CHANGED
@@ -1,6 +1,5 @@
1
  {
2
  "metadata": {
3
- "total_parameters": 2614341888,
4
  "total_size": 5228683776
5
  },
6
  "weight_map": {
 
1
  {
2
  "metadata": {
 
3
  "total_size": 5228683776
4
  },
5
  "weight_map": {
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060
3
- size 34362873
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922
3
+ size 17525357
tokenizer_config.json CHANGED
@@ -2002,7 +2002,6 @@
2002
  "bos_token": "<bos>",
2003
  "clean_up_tokenization_spaces": false,
2004
  "eos_token": "<eos>",
2005
- "extra_special_tokens": {},
2006
  "model_max_length": 1000000000000000019884624838656,
2007
  "pad_token": "<pad>",
2008
  "sp_model_kwargs": {},
 
2002
  "bos_token": "<bos>",
2003
  "clean_up_tokenization_spaces": false,
2004
  "eos_token": "<eos>",
 
2005
  "model_max_length": 1000000000000000019884624838656,
2006
  "pad_token": "<pad>",
2007
  "sp_model_kwargs": {},
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9135215cb270dde0a7ed2c4b9f6ba9c67456f2f032cdef826b8c2b54636cdd50
3
- size 6225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e6523a7a72444192a0862bac2e582aa1362490b9325175991b004ca9bcab83a
3
+ size 5624