End of training

Browse files

Files changed (9) hide show

README.md +28 -10
config.json +3 -30
generation_config.json +2 -4
model-00001-of-00002.safetensors +1 -1
model-00002-of-00002.safetensors +1 -1
model.safetensors.index.json +0 -1
tokenizer.json +2 -2
tokenizer_config.json +0 -1
training_args.bin +2 -2

README.md CHANGED Viewed

@@ -1,5 +1,4 @@
 ---
-library_name: transformers
 license: gemma
 base_model: google/gemma-2-2b
 tags:
@@ -18,8 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [google/gemma-2-2b](https://huggingface.co/google/gemma-2-2b) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.0598
-- Num Input Tokens Seen: 5753060
 ## Model description
@@ -44,21 +43,40 @@ The following hyperparameters were used during training:
 - seed: 0
 - gradient_accumulation_steps: 16
 - total_train_batch_size: 128
-- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: constant_with_warmup
 - lr_scheduler_warmup_ratio: 0.05
 - num_epochs: 1
 ### Training results
-| Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen |
-|:-------------:|:-----:|:----:|:---------------:|:-----------------:|
-| No log        | 0     | 0    | 1.3864          | 0                 |
 ### Framework versions
-- Transformers 4.57.3
-- Pytorch 2.8.0+cu128
 - Datasets 2.20.0
-- Tokenizers 0.22.2

 ---
 license: gemma
 base_model: google/gemma-2-2b
 tags:
 This model is a fine-tuned version of [google/gemma-2-2b](https://huggingface.co/google/gemma-2-2b) on an unknown dataset.
 It achieves the following results on the evaluation set:
+- Loss: 1.0637
+- Num Input Tokens Seen: 5698680
 ## Model description
 - seed: 0
 - gradient_accumulation_steps: 16
 - total_train_batch_size: 128
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: constant_with_warmup
 - lr_scheduler_warmup_ratio: 0.05
 - num_epochs: 1
 ### Training results
+| Training Loss | Epoch  | Step | Validation Loss | Input Tokens Seen |
+|:-------------:|:------:|:----:|:---------------:|:-----------------:|
+| No log        | 0      | 0    | 1.3911          | 0                 |
+| 1.3438        | 0.0511 | 5    | 1.2592          | 296352            |
+| 1.1848        | 0.1021 | 10   | 1.1700          | 589152            |
+| 1.1275        | 0.1532 | 15   | 1.1329          | 884504            |
+| 1.0731        | 0.2042 | 20   | 1.1072          | 1182424           |
+| 1.0942        | 0.2553 | 25   | 1.0975          | 1474984           |
+| 1.0931        | 0.3063 | 30   | 1.0918          | 1772592           |
+| 1.1141        | 0.3574 | 35   | 1.0878          | 2061504           |
+| 1.0847        | 0.4084 | 40   | 1.0843          | 2358064           |
+| 1.1003        | 0.4595 | 45   | 1.0811          | 2650896           |
+| 1.0771        | 0.5105 | 50   | 1.0790          | 2942864           |
+| 1.1246        | 0.5616 | 55   | 1.0765          | 3234512           |
+| 1.1009        | 0.6126 | 60   | 1.0744          | 3525376           |
+| 1.0904        | 0.6637 | 65   | 1.0727          | 3820376           |
+| 1.1707        | 0.7147 | 70   | 1.0711          | 4108240           |
+| 1.0279        | 0.7658 | 75   | 1.0692          | 4402208           |
+| 1.1465        | 0.8168 | 80   | 1.0680          | 4698016           |
+| 1.0785        | 0.8679 | 85   | 1.0669          | 4991408           |
+| 1.005         | 0.9190 | 90   | 1.0651          | 5285784           |
+| 1.0613        | 0.9700 | 95   | 1.0641          | 5580576           |
 ### Framework versions
+- Transformers 4.44.0
+- Pytorch 2.4.0+cu121
 - Datasets 2.20.0
+- Tokenizers 0.19.1

config.json CHANGED Viewed

@@ -1,4 +1,5 @@
 {
   "architectures": [
     "Gemma2ForCausalLM"
   ],
@@ -7,7 +8,6 @@
   "attn_logit_softcapping": 50.0,
   "bos_token_id": 2,
   "cache_implementation": "hybrid",
-  "dtype": "bfloat16",
   "eos_token_id": 1,
   "final_logit_softcapping": 30.0,
   "head_dim": 256,
@@ -16,34 +16,6 @@
   "hidden_size": 2304,
   "initializer_range": 0.02,
   "intermediate_size": 9216,
-  "layer_types": [
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention"
-  ],
   "max_position_embeddings": 8192,
   "model_type": "gemma2",
   "num_attention_heads": 8,
@@ -54,7 +26,8 @@
   "rms_norm_eps": 1e-06,
   "rope_theta": 10000.0,
   "sliding_window": 4096,
-  "transformers_version": "4.57.3",
   "use_cache": true,
   "vocab_size": 256000
 }

 {
+  "_name_or_path": "google/gemma-2-2b",
   "architectures": [
     "Gemma2ForCausalLM"
   ],
   "attn_logit_softcapping": 50.0,
   "bos_token_id": 2,
   "cache_implementation": "hybrid",
   "eos_token_id": 1,
   "final_logit_softcapping": 30.0,
   "head_dim": 256,
   "hidden_size": 2304,
   "initializer_range": 0.02,
   "intermediate_size": 9216,
   "max_position_embeddings": 8192,
   "model_type": "gemma2",
   "num_attention_heads": 8,
   "rms_norm_eps": 1e-06,
   "rope_theta": 10000.0,
   "sliding_window": 4096,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.0",
   "use_cache": true,
   "vocab_size": 256000
 }

generation_config.json CHANGED Viewed

@@ -2,9 +2,7 @@
   "_from_model_config": true,
   "bos_token_id": 2,
   "cache_implementation": "hybrid",
-  "eos_token_id": [
-    1
-  ],
   "pad_token_id": 0,
-  "transformers_version": "4.57.3"
 }

   "_from_model_config": true,
   "bos_token_id": 2,
   "cache_implementation": "hybrid",
+  "eos_token_id": 1,
   "pad_token_id": 0,
+  "transformers_version": "4.44.0"
 }

model-00001-of-00002.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8061196e584bf69703b5decf8949f78bddc8f82112f8b8410495f00e100424b8
 size 4988025760

 version https://git-lfs.github.com/spec/v1
+oid sha256:e98bc31b5d2e3707be7c513c25f1107f2f248630f8e02bd6bde9a9629e970129
 size 4988025760

model-00002-of-00002.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ec7c9816ef6b48494d3b92814b3a428b6e3bae63531a4f2e3f17dda68b0acd11
 size 240691728

 version https://git-lfs.github.com/spec/v1
+oid sha256:affaf9b17db5dc8a6fc5e35b03a6180fccb7711093cf26d7b437f57402997a0d
 size 240691728

model.safetensors.index.json CHANGED Viewed

@@ -1,6 +1,5 @@
 {
   "metadata": {
-    "total_parameters": 2614341888,
     "total_size": 5228683776
   },
   "weight_map": {

 {
   "metadata": {
     "total_size": 5228683776
   },
   "weight_map": {

tokenizer.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060
-size 34362873

 version https://git-lfs.github.com/spec/v1
+oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922
+size 17525357

tokenizer_config.json CHANGED Viewed

@@ -2002,7 +2002,6 @@
   "bos_token": "<bos>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<eos>",
-  "extra_special_tokens": {},
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<pad>",
   "sp_model_kwargs": {},

   "bos_token": "<bos>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<eos>",
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<pad>",
   "sp_model_kwargs": {},

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9135215cb270dde0a7ed2c4b9f6ba9c67456f2f032cdef826b8c2b54636cdd50
-size 6225

 version https://git-lfs.github.com/spec/v1
+oid sha256:3e6523a7a72444192a0862bac2e582aa1362490b9325175991b004ca9bcab83a
+size 5624