tahamajs committed on Sep 8

Commit

8f52f5d

verified ·

1 Parent(s): c2a213a

Upload folder using huggingface_hub

Browse files

Files changed (33) hide show

.gitattributes +2 -0
checkpoint-200/README.md +208 -0
checkpoint-200/adapter_config.json +41 -0
checkpoint-200/adapter_model.safetensors +3 -0
checkpoint-200/added_tokens.json +31 -0
checkpoint-200/chat_template.jinja +89 -0
checkpoint-200/merges.txt +0 -0
checkpoint-200/optimizer.pt +3 -0
checkpoint-200/rng_state.pth +3 -0
checkpoint-200/scaler.pt +3 -0
checkpoint-200/scheduler.pt +3 -0
checkpoint-200/special_tokens_map.json +39 -0
checkpoint-200/tokenizer.json +3 -0
checkpoint-200/tokenizer_config.json +254 -0
checkpoint-200/trainer_state.json +734 -0
checkpoint-200/training_args.bin +3 -0
checkpoint-200/vocab.json +0 -0
checkpoint-400/README.md +208 -0
checkpoint-400/adapter_config.json +41 -0
checkpoint-400/adapter_model.safetensors +3 -0
checkpoint-400/added_tokens.json +31 -0
checkpoint-400/chat_template.jinja +89 -0
checkpoint-400/merges.txt +0 -0
checkpoint-400/optimizer.pt +3 -0
checkpoint-400/rng_state.pth +3 -0
checkpoint-400/scaler.pt +3 -0
checkpoint-400/scheduler.pt +3 -0
checkpoint-400/special_tokens_map.json +39 -0
checkpoint-400/tokenizer.json +3 -0
checkpoint-400/tokenizer_config.json +254 -0
checkpoint-400/trainer_state.json +1434 -0
checkpoint-400/training_args.bin +3 -0
checkpoint-400/vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoint-200/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: ./Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:./Qwen3-8B
+- lora
+- transformers
+- unsloth
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.16.0

checkpoint-200/adapter_config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "gate_proj",
+    "v_proj",
+    "k_proj",
+    "q_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": true
+}

checkpoint-200/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:811e1dfe35c16127d3af080f2f847210342c66bb7b7546edc73cb14fff0620ab
+size 2834238032

checkpoint-200/added_tokens.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|analysis|>": 151670,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|forecast|>": 151671,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|response|>": 151669,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-200/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,89 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

checkpoint-200/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-200/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c4f47da4233f9b71711708833130a5cea206c2b248f94e61f090e0ae68b75c3f
+size 698777675

checkpoint-200/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
+size 14645

checkpoint-200/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70e7b501a7d2958d572a2f106018cb49bba6dc8bb1bdb67812e718653166f4b1
+size 1383

checkpoint-200/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:20fd17baa36963215dde4e90060899279a307587e71a140f36e9eb0ca498176f
+size 1465

checkpoint-200/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<|response|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|analysis|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|forecast|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-200/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77247e5fb2e966d04e513068b17cca472e105e7c56953e9b1d27d70b93d77e6f
+size 11423221

checkpoint-200/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,254 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<|response|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|analysis|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<|forecast|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|response|>",
+    "<|analysis|>",
+    "<|forecast|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 40960,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-200/trainer_state.json ADDED Viewed

	@@ -0,0 +1,734 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6557377049180327,
+  "eval_steps": 500,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006557377049180328,
+      "grad_norm": Infinity,
+      "learning_rate": 0.0,
+      "loss": 3.1428,
+      "step": 2
+    },
+    {
+      "epoch": 0.013114754098360656,
+      "grad_norm": 30.30763816833496,
+      "learning_rate": 3.278688524590164e-07,
+      "loss": 3.408,
+      "step": 4
+    },
+    {
+      "epoch": 0.019672131147540985,
+      "grad_norm": 44.01264953613281,
+      "learning_rate": 9.836065573770493e-07,
+      "loss": 3.1786,
+      "step": 6
+    },
+    {
+      "epoch": 0.02622950819672131,
+      "grad_norm": 83.83026123046875,
+      "learning_rate": 1.3114754098360657e-06,
+      "loss": 4.3409,
+      "step": 8
+    },
+    {
+      "epoch": 0.03278688524590164,
+      "grad_norm": 28.788864135742188,
+      "learning_rate": 1.9672131147540985e-06,
+      "loss": 3.6102,
+      "step": 10
+    },
+    {
+      "epoch": 0.03934426229508197,
+      "grad_norm": 49.386905670166016,
+      "learning_rate": 2.6229508196721314e-06,
+      "loss": 3.6219,
+      "step": 12
+    },
+    {
+      "epoch": 0.04590163934426229,
+      "grad_norm": 34.787452697753906,
+      "learning_rate": 3.2786885245901638e-06,
+      "loss": 3.2803,
+      "step": 14
+    },
+    {
+      "epoch": 0.05245901639344262,
+      "grad_norm": 18.163414001464844,
+      "learning_rate": 3.934426229508197e-06,
+      "loss": 2.812,
+      "step": 16
+    },
+    {
+      "epoch": 0.05901639344262295,
+      "grad_norm": 22.347946166992188,
+      "learning_rate": 4.59016393442623e-06,
+      "loss": 2.5893,
+      "step": 18
+    },
+    {
+      "epoch": 0.06557377049180328,
+      "grad_norm": 14.794243812561035,
+      "learning_rate": 5.245901639344263e-06,
+      "loss": 2.3009,
+      "step": 20
+    },
+    {
+      "epoch": 0.07213114754098361,
+      "grad_norm": 20.02682113647461,
+      "learning_rate": 5.9016393442622956e-06,
+      "loss": 2.213,
+      "step": 22
+    },
+    {
+      "epoch": 0.07868852459016394,
+      "grad_norm": 5.247424602508545,
+      "learning_rate": 6.5573770491803276e-06,
+      "loss": 1.9407,
+      "step": 24
+    },
+    {
+      "epoch": 0.08524590163934426,
+      "grad_norm": NaN,
+      "learning_rate": 7.213114754098361e-06,
+      "loss": 1.8539,
+      "step": 26
+    },
+    {
+      "epoch": 0.09180327868852459,
+      "grad_norm": 3.0204479694366455,
+      "learning_rate": 7.540983606557377e-06,
+      "loss": 1.8437,
+      "step": 28
+    },
+    {
+      "epoch": 0.09836065573770492,
+      "grad_norm": 1.8878525495529175,
+      "learning_rate": 8.19672131147541e-06,
+      "loss": 1.7493,
+      "step": 30
+    },
+    {
+      "epoch": 0.10491803278688525,
+      "grad_norm": 1.5446749925613403,
+      "learning_rate": 8.852459016393443e-06,
+      "loss": 1.6798,
+      "step": 32
+    },
+    {
+      "epoch": 0.11147540983606558,
+      "grad_norm": 1.1978412866592407,
+      "learning_rate": 9.508196721311476e-06,
+      "loss": 1.6628,
+      "step": 34
+    },
+    {
+      "epoch": 0.1180327868852459,
+      "grad_norm": 0.9035641551017761,
+      "learning_rate": 1.0163934426229509e-05,
+      "loss": 1.5871,
+      "step": 36
+    },
+    {
+      "epoch": 0.12459016393442623,
+      "grad_norm": 0.765148401260376,
+      "learning_rate": 1.0819672131147544e-05,
+      "loss": 1.5404,
+      "step": 38
+    },
+    {
+      "epoch": 0.13114754098360656,
+      "grad_norm": 0.6516129970550537,
+      "learning_rate": 1.1475409836065575e-05,
+      "loss": 1.5619,
+      "step": 40
+    },
+    {
+      "epoch": 0.1377049180327869,
+      "grad_norm": NaN,
+      "learning_rate": 1.2131147540983608e-05,
+      "loss": 1.5199,
+      "step": 42
+    },
+    {
+      "epoch": 0.14426229508196722,
+      "grad_norm": 0.6104413866996765,
+      "learning_rate": 1.2459016393442624e-05,
+      "loss": 1.5179,
+      "step": 44
+    },
+    {
+      "epoch": 0.15081967213114755,
+      "grad_norm": 0.5764546990394592,
+      "learning_rate": 1.3114754098360655e-05,
+      "loss": 1.5158,
+      "step": 46
+    },
+    {
+      "epoch": 0.15737704918032788,
+      "grad_norm": 0.5812691450119019,
+      "learning_rate": 1.377049180327869e-05,
+      "loss": 1.4483,
+      "step": 48
+    },
+    {
+      "epoch": 0.16393442622950818,
+      "grad_norm": 0.5719778537750244,
+      "learning_rate": 1.4426229508196722e-05,
+      "loss": 1.4643,
+      "step": 50
+    },
+    {
+      "epoch": 0.17049180327868851,
+      "grad_norm": 0.571936845779419,
+      "learning_rate": 1.5081967213114754e-05,
+      "loss": 1.4129,
+      "step": 52
+    },
+    {
+      "epoch": 0.17704918032786884,
+      "grad_norm": 0.5835412740707397,
+      "learning_rate": 1.5737704918032788e-05,
+      "loss": 1.4049,
+      "step": 54
+    },
+    {
+      "epoch": 0.18360655737704917,
+      "grad_norm": 0.5831025242805481,
+      "learning_rate": 1.639344262295082e-05,
+      "loss": 1.3912,
+      "step": 56
+    },
+    {
+      "epoch": 0.1901639344262295,
+      "grad_norm": 0.594451367855072,
+      "learning_rate": 1.7049180327868854e-05,
+      "loss": 1.334,
+      "step": 58
+    },
+    {
+      "epoch": 0.19672131147540983,
+      "grad_norm": 0.6067811846733093,
+      "learning_rate": 1.7704918032786887e-05,
+      "loss": 1.3164,
+      "step": 60
+    },
+    {
+      "epoch": 0.20327868852459016,
+      "grad_norm": 1.0636777877807617,
+      "learning_rate": 1.836065573770492e-05,
+      "loss": 1.3004,
+      "step": 62
+    },
+    {
+      "epoch": 0.2098360655737705,
+      "grad_norm": 0.6091246008872986,
+      "learning_rate": 1.9016393442622952e-05,
+      "loss": 1.2634,
+      "step": 64
+    },
+    {
+      "epoch": 0.21639344262295082,
+      "grad_norm": 0.5987696051597595,
+      "learning_rate": 1.9672131147540985e-05,
+      "loss": 1.226,
+      "step": 66
+    },
+    {
+      "epoch": 0.22295081967213115,
+      "grad_norm": 0.6262068152427673,
+      "learning_rate": 1.9999963263091053e-05,
+      "loss": 1.2714,
+      "step": 68
+    },
+    {
+      "epoch": 0.22950819672131148,
+      "grad_norm": 0.5762652158737183,
+      "learning_rate": 1.9999669369438976e-05,
+      "loss": 1.2161,
+      "step": 70
+    },
+    {
+      "epoch": 0.2360655737704918,
+      "grad_norm": 0.5977967381477356,
+      "learning_rate": 1.99990815907722e-05,
+      "loss": 1.2051,
+      "step": 72
+    },
+    {
+      "epoch": 0.24262295081967214,
+      "grad_norm": 0.5854564905166626,
+      "learning_rate": 1.9998199944365234e-05,
+      "loss": 1.1852,
+      "step": 74
+    },
+    {
+      "epoch": 0.24918032786885247,
+      "grad_norm": 0.6072487235069275,
+      "learning_rate": 1.9997024456129198e-05,
+      "loss": 1.1669,
+      "step": 76
+    },
+    {
+      "epoch": 0.25573770491803277,
+      "grad_norm": 0.6043636202812195,
+      "learning_rate": 1.9995555160611073e-05,
+      "loss": 1.1619,
+      "step": 78
+    },
+    {
+      "epoch": 0.26229508196721313,
+      "grad_norm": 0.6353415250778198,
+      "learning_rate": 1.9993792100992683e-05,
+      "loss": 1.189,
+      "step": 80
+    },
+    {
+      "epoch": 0.26885245901639343,
+      "grad_norm": 0.5906977653503418,
+      "learning_rate": 1.9991735329089416e-05,
+      "loss": 1.1305,
+      "step": 82
+    },
+    {
+      "epoch": 0.2754098360655738,
+      "grad_norm": 0.6816039681434631,
+      "learning_rate": 1.9989384905348718e-05,
+      "loss": 1.1603,
+      "step": 84
+    },
+    {
+      "epoch": 0.2819672131147541,
+      "grad_norm": 0.6609848141670227,
+      "learning_rate": 1.9986740898848306e-05,
+      "loss": 1.1291,
+      "step": 86
+    },
+    {
+      "epoch": 0.28852459016393445,
+      "grad_norm": 0.6140775680541992,
+      "learning_rate": 1.9983803387294138e-05,
+      "loss": 1.1099,
+      "step": 88
+    },
+    {
+      "epoch": 0.29508196721311475,
+      "grad_norm": 1.0536493062973022,
+      "learning_rate": 1.9980572457018124e-05,
+      "loss": 1.1065,
+      "step": 90
+    },
+    {
+      "epoch": 0.3016393442622951,
+      "grad_norm": 0.6520818471908569,
+      "learning_rate": 1.997704820297561e-05,
+      "loss": 1.0993,
+      "step": 92
+    },
+    {
+      "epoch": 0.3081967213114754,
+      "grad_norm": 0.6376558542251587,
+      "learning_rate": 1.9973230728742563e-05,
+      "loss": 1.1249,
+      "step": 94
+    },
+    {
+      "epoch": 0.31475409836065577,
+      "grad_norm": 0.6940956115722656,
+      "learning_rate": 1.9969120146512542e-05,
+      "loss": 1.0645,
+      "step": 96
+    },
+    {
+      "epoch": 0.32131147540983607,
+      "grad_norm": 0.669341504573822,
+      "learning_rate": 1.996471657709339e-05,
+      "loss": 1.0702,
+      "step": 98
+    },
+    {
+      "epoch": 0.32786885245901637,
+      "grad_norm": 0.6368036866188049,
+      "learning_rate": 1.9960020149903693e-05,
+      "loss": 1.0681,
+      "step": 100
+    },
+    {
+      "epoch": 0.3344262295081967,
+      "grad_norm": 0.7006358504295349,
+      "learning_rate": 1.9955031002968972e-05,
+      "loss": 1.0446,
+      "step": 102
+    },
+    {
+      "epoch": 0.34098360655737703,
+      "grad_norm": 0.6593023538589478,
+      "learning_rate": 1.9949749282917628e-05,
+      "loss": 1.0522,
+      "step": 104
+    },
+    {
+      "epoch": 0.3475409836065574,
+      "grad_norm": 0.6703020334243774,
+      "learning_rate": 1.994417514497663e-05,
+      "loss": 1.0485,
+      "step": 106
+    },
+    {
+      "epoch": 0.3540983606557377,
+      "grad_norm": 0.7253920435905457,
+      "learning_rate": 1.9938308752966957e-05,
+      "loss": 1.0748,
+      "step": 108
+    },
+    {
+      "epoch": 0.36065573770491804,
+      "grad_norm": 0.7168115377426147,
+      "learning_rate": 1.993215027929878e-05,
+      "loss": 1.0431,
+      "step": 110
+    },
+    {
+      "epoch": 0.36721311475409835,
+      "grad_norm": 0.7738324999809265,
+      "learning_rate": 1.992569990496639e-05,
+      "loss": 1.0494,
+      "step": 112
+    },
+    {
+      "epoch": 0.3737704918032787,
+      "grad_norm": 0.7249582409858704,
+      "learning_rate": 1.9918957819542895e-05,
+      "loss": 1.0335,
+      "step": 114
+    },
+    {
+      "epoch": 0.380327868852459,
+      "grad_norm": 0.7557644248008728,
+      "learning_rate": 1.9911924221174638e-05,
+      "loss": 1.0451,
+      "step": 116
+    },
+    {
+      "epoch": 0.38688524590163936,
+      "grad_norm": 0.7169610261917114,
+      "learning_rate": 1.990459931657536e-05,
+      "loss": 1.0057,
+      "step": 118
+    },
+    {
+      "epoch": 0.39344262295081966,
+      "grad_norm": 0.7137094736099243,
+      "learning_rate": 1.989698332102015e-05,
+      "loss": 1.0099,
+      "step": 120
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.7775920629501343,
+      "learning_rate": 1.9889076458339116e-05,
+      "loss": 0.9981,
+      "step": 122
+    },
+    {
+      "epoch": 0.4065573770491803,
+      "grad_norm": 0.7513145804405212,
+      "learning_rate": 1.9880878960910772e-05,
+      "loss": 0.9943,
+      "step": 124
+    },
+    {
+      "epoch": 0.4131147540983607,
+      "grad_norm": 0.7948256134986877,
+      "learning_rate": 1.9872391069655258e-05,
+      "loss": 0.9812,
+      "step": 126
+    },
+    {
+      "epoch": 0.419672131147541,
+      "grad_norm": 1.0542031526565552,
+      "learning_rate": 1.9863613034027224e-05,
+      "loss": 1.0288,
+      "step": 128
+    },
+    {
+      "epoch": 0.4262295081967213,
+      "grad_norm": 0.7674461603164673,
+      "learning_rate": 1.9854545112008514e-05,
+      "loss": 0.9947,
+      "step": 130
+    },
+    {
+      "epoch": 0.43278688524590164,
+      "grad_norm": 0.7810584306716919,
+      "learning_rate": 1.9845187570100576e-05,
+      "loss": 1.0351,
+      "step": 132
+    },
+    {
+      "epoch": 0.43934426229508194,
+      "grad_norm": 0.745934247970581,
+      "learning_rate": 1.983554068331664e-05,
+      "loss": 0.995,
+      "step": 134
+    },
+    {
+      "epoch": 0.4459016393442623,
+      "grad_norm": 0.8037360906600952,
+      "learning_rate": 1.982560473517362e-05,
+      "loss": 0.978,
+      "step": 136
+    },
+    {
+      "epoch": 0.4524590163934426,
+      "grad_norm": 0.8278532028198242,
+      "learning_rate": 1.9815380017683804e-05,
+      "loss": 0.9646,
+      "step": 138
+    },
+    {
+      "epoch": 0.45901639344262296,
+      "grad_norm": 0.8025350570678711,
+      "learning_rate": 1.9804866831346254e-05,
+      "loss": 0.9976,
+      "step": 140
+    },
+    {
+      "epoch": 0.46557377049180326,
+      "grad_norm": 0.7931703925132751,
+      "learning_rate": 1.9794065485137973e-05,
+      "loss": 0.9558,
+      "step": 142
+    },
+    {
+      "epoch": 0.4721311475409836,
+      "grad_norm": 0.8693389892578125,
+      "learning_rate": 1.9782976296504833e-05,
+      "loss": 0.9804,
+      "step": 144
+    },
+    {
+      "epoch": 0.4786885245901639,
+      "grad_norm": 0.9033321738243103,
+      "learning_rate": 1.9771599591352254e-05,
+      "loss": 0.9983,
+      "step": 146
+    },
+    {
+      "epoch": 0.4852459016393443,
+      "grad_norm": 0.8326368927955627,
+      "learning_rate": 1.97599357040356e-05,
+      "loss": 0.9614,
+      "step": 148
+    },
+    {
+      "epoch": 0.4918032786885246,
+      "grad_norm": 0.8451815247535706,
+      "learning_rate": 1.974798497735038e-05,
+      "loss": 0.9556,
+      "step": 150
+    },
+    {
+      "epoch": 0.49836065573770494,
+      "grad_norm": 0.8128353357315063,
+      "learning_rate": 1.9741902158673524e-05,
+      "loss": 0.9562,
+      "step": 152
+    },
+    {
+      "epoch": 0.5049180327868853,
+      "grad_norm": 0.8914998173713684,
+      "learning_rate": 1.972952183411495e-05,
+      "loss": 0.94,
+      "step": 154
+    },
+    {
+      "epoch": 0.5114754098360655,
+      "grad_norm": 0.853595495223999,
+      "learning_rate": 1.971685556403543e-05,
+      "loss": 0.9464,
+      "step": 156
+    },
+    {
+      "epoch": 0.5180327868852459,
+      "grad_norm": 0.8786745071411133,
+      "learning_rate": 1.9703903720689954e-05,
+      "loss": 0.9173,
+      "step": 158
+    },
+    {
+      "epoch": 0.5245901639344263,
+      "grad_norm": 0.8551218509674072,
+      "learning_rate": 1.9690666684726382e-05,
+      "loss": 0.9355,
+      "step": 160
+    },
+    {
+      "epoch": 0.5311475409836065,
+      "grad_norm": 0.8704633116722107,
+      "learning_rate": 1.9677144845174227e-05,
+      "loss": 0.9343,
+      "step": 162
+    },
+    {
+      "epoch": 0.5377049180327869,
+      "grad_norm": 0.873148500919342,
+      "learning_rate": 1.966333859943323e-05,
+      "loss": 0.925,
+      "step": 164
+    },
+    {
+      "epoch": 0.5442622950819672,
+      "grad_norm": 0.8460001349449158,
+      "learning_rate": 1.9649248353261673e-05,
+      "loss": 0.9286,
+      "step": 166
+    },
+    {
+      "epoch": 0.5508196721311476,
+      "grad_norm": 0.8666718006134033,
+      "learning_rate": 1.9634874520764478e-05,
+      "loss": 0.963,
+      "step": 168
+    },
+    {
+      "epoch": 0.5573770491803278,
+      "grad_norm": 0.937882661819458,
+      "learning_rate": 1.9620217524381007e-05,
+      "loss": 0.9232,
+      "step": 170
+    },
+    {
+      "epoch": 0.5639344262295082,
+      "grad_norm": 0.9387129545211792,
+      "learning_rate": 1.9605277794872656e-05,
+      "loss": 0.9351,
+      "step": 172
+    },
+    {
+      "epoch": 0.5704918032786885,
+      "grad_norm": 0.9671485424041748,
+      "learning_rate": 1.9590055771310212e-05,
+      "loss": 0.9674,
+      "step": 174
+    },
+    {
+      "epoch": 0.5770491803278689,
+      "grad_norm": 0.905163586139679,
+      "learning_rate": 1.9574551901060923e-05,
+      "loss": 0.9305,
+      "step": 176
+    },
+    {
+      "epoch": 0.5836065573770491,
+      "grad_norm": 0.9798932671546936,
+      "learning_rate": 1.955876663977537e-05,
+      "loss": 0.9267,
+      "step": 178
+    },
+    {
+      "epoch": 0.5901639344262295,
+      "grad_norm": 0.8553682565689087,
+      "learning_rate": 1.9542700451374068e-05,
+      "loss": 0.9053,
+      "step": 180
+    },
+    {
+      "epoch": 0.5967213114754099,
+      "grad_norm": 0.9667699337005615,
+      "learning_rate": 1.9526353808033827e-05,
+      "loss": 0.8957,
+      "step": 182
+    },
+    {
+      "epoch": 0.6032786885245902,
+      "grad_norm": 73.45634460449219,
+      "learning_rate": 1.9509727190173883e-05,
+      "loss": 0.9305,
+      "step": 184
+    },
+    {
+      "epoch": 0.6098360655737705,
+      "grad_norm": 1.0331788063049316,
+      "learning_rate": 1.949282108644178e-05,
+      "loss": 0.9105,
+      "step": 186
+    },
+    {
+      "epoch": 0.6163934426229508,
+      "grad_norm": 0.8748126029968262,
+      "learning_rate": 1.9475635993698995e-05,
+      "loss": 0.9232,
+      "step": 188
+    },
+    {
+      "epoch": 0.6229508196721312,
+      "grad_norm": 0.8973132371902466,
+      "learning_rate": 1.9458172417006347e-05,
+      "loss": 0.9148,
+      "step": 190
+    },
+    {
+      "epoch": 0.6295081967213115,
+      "grad_norm": 0.9401828050613403,
+      "learning_rate": 1.9440430869609167e-05,
+      "loss": 0.9679,
+      "step": 192
+    },
+    {
+      "epoch": 0.6360655737704918,
+      "grad_norm": 1.0153672695159912,
+      "learning_rate": 1.9422411872922173e-05,
+      "loss": 0.9044,
+      "step": 194
+    },
+    {
+      "epoch": 0.6426229508196721,
+      "grad_norm": 1.04591965675354,
+      "learning_rate": 1.9404115956514196e-05,
+      "loss": 0.926,
+      "step": 196
+    },
+    {
+      "epoch": 0.6491803278688525,
+      "grad_norm": 0.9370441436767578,
+      "learning_rate": 1.9385543658092572e-05,
+      "loss": 0.9427,
+      "step": 198
+    },
+    {
+      "epoch": 0.6557377049180327,
+      "grad_norm": 0.9781646728515625,
+      "learning_rate": 1.936669552348737e-05,
+      "loss": 0.899,
+      "step": 200
+    }
+  ],
+  "logging_steps": 2,
+  "max_steps": 1220,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.887335423974277e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-200/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44207fb5ae35d66a04efe3ac870311d55bf06ee7dec676dcb7a600c416961632
+size 5777

checkpoint-200/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-400/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: ./Qwen3-8B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:./Qwen3-8B
+- lora
+- transformers
+- unsloth
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.16.0

checkpoint-400/adapter_config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "./Qwen3-8B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "gate_proj",
+    "v_proj",
+    "k_proj",
+    "q_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": true
+}

checkpoint-400/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4cdd1ef90e6372ff67d23b6dfdeabfff377a95c83ba616b9ab04c00634eb452c
+size 2834238032

checkpoint-400/added_tokens.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|analysis|>": 151670,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|forecast|>": 151671,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|response|>": 151669,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-400/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,89 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

checkpoint-400/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-400/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de58aea34018afb4e8b09e4d4b9c71f79d47ac7a2c06b20752c6d439369623e2
+size 698777675

checkpoint-400/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
+size 14645

checkpoint-400/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:329c939d6b39445051e8273b6c2f5ee67efebfee6455798df25c1099b25e25f2
+size 1383

checkpoint-400/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b3bbf31c74ff87cba344b3a29a618ca6f0a6601825cf4c833430ccb639cb9f4
+size 1465

checkpoint-400/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<|response|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|analysis|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|forecast|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-400/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77247e5fb2e966d04e513068b17cca472e105e7c56953e9b1d27d70b93d77e6f
+size 11423221

checkpoint-400/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,254 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<|response|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|analysis|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<|forecast|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|response|>",
+    "<|analysis|>",
+    "<|forecast|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 40960,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-400/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1434 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.3114754098360657,
+  "eval_steps": 500,
+  "global_step": 400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006557377049180328,
+      "grad_norm": Infinity,
+      "learning_rate": 0.0,
+      "loss": 3.1428,
+      "step": 2
+    },
+    {
+      "epoch": 0.013114754098360656,
+      "grad_norm": 30.30763816833496,
+      "learning_rate": 3.278688524590164e-07,
+      "loss": 3.408,
+      "step": 4
+    },
+    {
+      "epoch": 0.019672131147540985,
+      "grad_norm": 44.01264953613281,
+      "learning_rate": 9.836065573770493e-07,
+      "loss": 3.1786,
+      "step": 6
+    },
+    {
+      "epoch": 0.02622950819672131,
+      "grad_norm": 83.83026123046875,
+      "learning_rate": 1.3114754098360657e-06,
+      "loss": 4.3409,
+      "step": 8
+    },
+    {
+      "epoch": 0.03278688524590164,
+      "grad_norm": 28.788864135742188,
+      "learning_rate": 1.9672131147540985e-06,
+      "loss": 3.6102,
+      "step": 10
+    },
+    {
+      "epoch": 0.03934426229508197,
+      "grad_norm": 49.386905670166016,
+      "learning_rate": 2.6229508196721314e-06,
+      "loss": 3.6219,
+      "step": 12
+    },
+    {
+      "epoch": 0.04590163934426229,
+      "grad_norm": 34.787452697753906,
+      "learning_rate": 3.2786885245901638e-06,
+      "loss": 3.2803,
+      "step": 14
+    },
+    {
+      "epoch": 0.05245901639344262,
+      "grad_norm": 18.163414001464844,
+      "learning_rate": 3.934426229508197e-06,
+      "loss": 2.812,
+      "step": 16
+    },
+    {
+      "epoch": 0.05901639344262295,
+      "grad_norm": 22.347946166992188,
+      "learning_rate": 4.59016393442623e-06,
+      "loss": 2.5893,
+      "step": 18
+    },
+    {
+      "epoch": 0.06557377049180328,
+      "grad_norm": 14.794243812561035,
+      "learning_rate": 5.245901639344263e-06,
+      "loss": 2.3009,
+      "step": 20
+    },
+    {
+      "epoch": 0.07213114754098361,
+      "grad_norm": 20.02682113647461,
+      "learning_rate": 5.9016393442622956e-06,
+      "loss": 2.213,
+      "step": 22
+    },
+    {
+      "epoch": 0.07868852459016394,
+      "grad_norm": 5.247424602508545,
+      "learning_rate": 6.5573770491803276e-06,
+      "loss": 1.9407,
+      "step": 24
+    },
+    {
+      "epoch": 0.08524590163934426,
+      "grad_norm": NaN,
+      "learning_rate": 7.213114754098361e-06,
+      "loss": 1.8539,
+      "step": 26
+    },
+    {
+      "epoch": 0.09180327868852459,
+      "grad_norm": 3.0204479694366455,
+      "learning_rate": 7.540983606557377e-06,
+      "loss": 1.8437,
+      "step": 28
+    },
+    {
+      "epoch": 0.09836065573770492,
+      "grad_norm": 1.8878525495529175,
+      "learning_rate": 8.19672131147541e-06,
+      "loss": 1.7493,
+      "step": 30
+    },
+    {
+      "epoch": 0.10491803278688525,
+      "grad_norm": 1.5446749925613403,
+      "learning_rate": 8.852459016393443e-06,
+      "loss": 1.6798,
+      "step": 32
+    },
+    {
+      "epoch": 0.11147540983606558,
+      "grad_norm": 1.1978412866592407,
+      "learning_rate": 9.508196721311476e-06,
+      "loss": 1.6628,
+      "step": 34
+    },
+    {
+      "epoch": 0.1180327868852459,
+      "grad_norm": 0.9035641551017761,
+      "learning_rate": 1.0163934426229509e-05,
+      "loss": 1.5871,
+      "step": 36
+    },
+    {
+      "epoch": 0.12459016393442623,
+      "grad_norm": 0.765148401260376,
+      "learning_rate": 1.0819672131147544e-05,
+      "loss": 1.5404,
+      "step": 38
+    },
+    {
+      "epoch": 0.13114754098360656,
+      "grad_norm": 0.6516129970550537,
+      "learning_rate": 1.1475409836065575e-05,
+      "loss": 1.5619,
+      "step": 40
+    },
+    {
+      "epoch": 0.1377049180327869,
+      "grad_norm": NaN,
+      "learning_rate": 1.2131147540983608e-05,
+      "loss": 1.5199,
+      "step": 42
+    },
+    {
+      "epoch": 0.14426229508196722,
+      "grad_norm": 0.6104413866996765,
+      "learning_rate": 1.2459016393442624e-05,
+      "loss": 1.5179,
+      "step": 44
+    },
+    {
+      "epoch": 0.15081967213114755,
+      "grad_norm": 0.5764546990394592,
+      "learning_rate": 1.3114754098360655e-05,
+      "loss": 1.5158,
+      "step": 46
+    },
+    {
+      "epoch": 0.15737704918032788,
+      "grad_norm": 0.5812691450119019,
+      "learning_rate": 1.377049180327869e-05,
+      "loss": 1.4483,
+      "step": 48
+    },
+    {
+      "epoch": 0.16393442622950818,
+      "grad_norm": 0.5719778537750244,
+      "learning_rate": 1.4426229508196722e-05,
+      "loss": 1.4643,
+      "step": 50
+    },
+    {
+      "epoch": 0.17049180327868851,
+      "grad_norm": 0.571936845779419,
+      "learning_rate": 1.5081967213114754e-05,
+      "loss": 1.4129,
+      "step": 52
+    },
+    {
+      "epoch": 0.17704918032786884,
+      "grad_norm": 0.5835412740707397,
+      "learning_rate": 1.5737704918032788e-05,
+      "loss": 1.4049,
+      "step": 54
+    },
+    {
+      "epoch": 0.18360655737704917,
+      "grad_norm": 0.5831025242805481,
+      "learning_rate": 1.639344262295082e-05,
+      "loss": 1.3912,
+      "step": 56
+    },
+    {
+      "epoch": 0.1901639344262295,
+      "grad_norm": 0.594451367855072,
+      "learning_rate": 1.7049180327868854e-05,
+      "loss": 1.334,
+      "step": 58
+    },
+    {
+      "epoch": 0.19672131147540983,
+      "grad_norm": 0.6067811846733093,
+      "learning_rate": 1.7704918032786887e-05,
+      "loss": 1.3164,
+      "step": 60
+    },
+    {
+      "epoch": 0.20327868852459016,
+      "grad_norm": 1.0636777877807617,
+      "learning_rate": 1.836065573770492e-05,
+      "loss": 1.3004,
+      "step": 62
+    },
+    {
+      "epoch": 0.2098360655737705,
+      "grad_norm": 0.6091246008872986,
+      "learning_rate": 1.9016393442622952e-05,
+      "loss": 1.2634,
+      "step": 64
+    },
+    {
+      "epoch": 0.21639344262295082,
+      "grad_norm": 0.5987696051597595,
+      "learning_rate": 1.9672131147540985e-05,
+      "loss": 1.226,
+      "step": 66
+    },
+    {
+      "epoch": 0.22295081967213115,
+      "grad_norm": 0.6262068152427673,
+      "learning_rate": 1.9999963263091053e-05,
+      "loss": 1.2714,
+      "step": 68
+    },
+    {
+      "epoch": 0.22950819672131148,
+      "grad_norm": 0.5762652158737183,
+      "learning_rate": 1.9999669369438976e-05,
+      "loss": 1.2161,
+      "step": 70
+    },
+    {
+      "epoch": 0.2360655737704918,
+      "grad_norm": 0.5977967381477356,
+      "learning_rate": 1.99990815907722e-05,
+      "loss": 1.2051,
+      "step": 72
+    },
+    {
+      "epoch": 0.24262295081967214,
+      "grad_norm": 0.5854564905166626,
+      "learning_rate": 1.9998199944365234e-05,
+      "loss": 1.1852,
+      "step": 74
+    },
+    {
+      "epoch": 0.24918032786885247,
+      "grad_norm": 0.6072487235069275,
+      "learning_rate": 1.9997024456129198e-05,
+      "loss": 1.1669,
+      "step": 76
+    },
+    {
+      "epoch": 0.25573770491803277,
+      "grad_norm": 0.6043636202812195,
+      "learning_rate": 1.9995555160611073e-05,
+      "loss": 1.1619,
+      "step": 78
+    },
+    {
+      "epoch": 0.26229508196721313,
+      "grad_norm": 0.6353415250778198,
+      "learning_rate": 1.9993792100992683e-05,
+      "loss": 1.189,
+      "step": 80
+    },
+    {
+      "epoch": 0.26885245901639343,
+      "grad_norm": 0.5906977653503418,
+      "learning_rate": 1.9991735329089416e-05,
+      "loss": 1.1305,
+      "step": 82
+    },
+    {
+      "epoch": 0.2754098360655738,
+      "grad_norm": 0.6816039681434631,
+      "learning_rate": 1.9989384905348718e-05,
+      "loss": 1.1603,
+      "step": 84
+    },
+    {
+      "epoch": 0.2819672131147541,
+      "grad_norm": 0.6609848141670227,
+      "learning_rate": 1.9986740898848306e-05,
+      "loss": 1.1291,
+      "step": 86
+    },
+    {
+      "epoch": 0.28852459016393445,
+      "grad_norm": 0.6140775680541992,
+      "learning_rate": 1.9983803387294138e-05,
+      "loss": 1.1099,
+      "step": 88
+    },
+    {
+      "epoch": 0.29508196721311475,
+      "grad_norm": 1.0536493062973022,
+      "learning_rate": 1.9980572457018124e-05,
+      "loss": 1.1065,
+      "step": 90
+    },
+    {
+      "epoch": 0.3016393442622951,
+      "grad_norm": 0.6520818471908569,
+      "learning_rate": 1.997704820297561e-05,
+      "loss": 1.0993,
+      "step": 92
+    },
+    {
+      "epoch": 0.3081967213114754,
+      "grad_norm": 0.6376558542251587,
+      "learning_rate": 1.9973230728742563e-05,
+      "loss": 1.1249,
+      "step": 94
+    },
+    {
+      "epoch": 0.31475409836065577,
+      "grad_norm": 0.6940956115722656,
+      "learning_rate": 1.9969120146512542e-05,
+      "loss": 1.0645,
+      "step": 96
+    },
+    {
+      "epoch": 0.32131147540983607,
+      "grad_norm": 0.669341504573822,
+      "learning_rate": 1.996471657709339e-05,
+      "loss": 1.0702,
+      "step": 98
+    },
+    {
+      "epoch": 0.32786885245901637,
+      "grad_norm": 0.6368036866188049,
+      "learning_rate": 1.9960020149903693e-05,
+      "loss": 1.0681,
+      "step": 100
+    },
+    {
+      "epoch": 0.3344262295081967,
+      "grad_norm": 0.7006358504295349,
+      "learning_rate": 1.9955031002968972e-05,
+      "loss": 1.0446,
+      "step": 102
+    },
+    {
+      "epoch": 0.34098360655737703,
+      "grad_norm": 0.6593023538589478,
+      "learning_rate": 1.9949749282917628e-05,
+      "loss": 1.0522,
+      "step": 104
+    },
+    {
+      "epoch": 0.3475409836065574,
+      "grad_norm": 0.6703020334243774,
+      "learning_rate": 1.994417514497663e-05,
+      "loss": 1.0485,
+      "step": 106
+    },
+    {
+      "epoch": 0.3540983606557377,
+      "grad_norm": 0.7253920435905457,
+      "learning_rate": 1.9938308752966957e-05,
+      "loss": 1.0748,
+      "step": 108
+    },
+    {
+      "epoch": 0.36065573770491804,
+      "grad_norm": 0.7168115377426147,
+      "learning_rate": 1.993215027929878e-05,
+      "loss": 1.0431,
+      "step": 110
+    },
+    {
+      "epoch": 0.36721311475409835,
+      "grad_norm": 0.7738324999809265,
+      "learning_rate": 1.992569990496639e-05,
+      "loss": 1.0494,
+      "step": 112
+    },
+    {
+      "epoch": 0.3737704918032787,
+      "grad_norm": 0.7249582409858704,
+      "learning_rate": 1.9918957819542895e-05,
+      "loss": 1.0335,
+      "step": 114
+    },
+    {
+      "epoch": 0.380327868852459,
+      "grad_norm": 0.7557644248008728,
+      "learning_rate": 1.9911924221174638e-05,
+      "loss": 1.0451,
+      "step": 116
+    },
+    {
+      "epoch": 0.38688524590163936,
+      "grad_norm": 0.7169610261917114,
+      "learning_rate": 1.990459931657536e-05,
+      "loss": 1.0057,
+      "step": 118
+    },
+    {
+      "epoch": 0.39344262295081966,
+      "grad_norm": 0.7137094736099243,
+      "learning_rate": 1.989698332102015e-05,
+      "loss": 1.0099,
+      "step": 120
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.7775920629501343,
+      "learning_rate": 1.9889076458339116e-05,
+      "loss": 0.9981,
+      "step": 122
+    },
+    {
+      "epoch": 0.4065573770491803,
+      "grad_norm": 0.7513145804405212,
+      "learning_rate": 1.9880878960910772e-05,
+      "loss": 0.9943,
+      "step": 124
+    },
+    {
+      "epoch": 0.4131147540983607,
+      "grad_norm": 0.7948256134986877,
+      "learning_rate": 1.9872391069655258e-05,
+      "loss": 0.9812,
+      "step": 126
+    },
+    {
+      "epoch": 0.419672131147541,
+      "grad_norm": 1.0542031526565552,
+      "learning_rate": 1.9863613034027224e-05,
+      "loss": 1.0288,
+      "step": 128
+    },
+    {
+      "epoch": 0.4262295081967213,
+      "grad_norm": 0.7674461603164673,
+      "learning_rate": 1.9854545112008514e-05,
+      "loss": 0.9947,
+      "step": 130
+    },
+    {
+      "epoch": 0.43278688524590164,
+      "grad_norm": 0.7810584306716919,
+      "learning_rate": 1.9845187570100576e-05,
+      "loss": 1.0351,
+      "step": 132
+    },
+    {
+      "epoch": 0.43934426229508194,
+      "grad_norm": 0.745934247970581,
+      "learning_rate": 1.983554068331664e-05,
+      "loss": 0.995,
+      "step": 134
+    },
+    {
+      "epoch": 0.4459016393442623,
+      "grad_norm": 0.8037360906600952,
+      "learning_rate": 1.982560473517362e-05,
+      "loss": 0.978,
+      "step": 136
+    },
+    {
+      "epoch": 0.4524590163934426,
+      "grad_norm": 0.8278532028198242,
+      "learning_rate": 1.9815380017683804e-05,
+      "loss": 0.9646,
+      "step": 138
+    },
+    {
+      "epoch": 0.45901639344262296,
+      "grad_norm": 0.8025350570678711,
+      "learning_rate": 1.9804866831346254e-05,
+      "loss": 0.9976,
+      "step": 140
+    },
+    {
+      "epoch": 0.46557377049180326,
+      "grad_norm": 0.7931703925132751,
+      "learning_rate": 1.9794065485137973e-05,
+      "loss": 0.9558,
+      "step": 142
+    },
+    {
+      "epoch": 0.4721311475409836,
+      "grad_norm": 0.8693389892578125,
+      "learning_rate": 1.9782976296504833e-05,
+      "loss": 0.9804,
+      "step": 144
+    },
+    {
+      "epoch": 0.4786885245901639,
+      "grad_norm": 0.9033321738243103,
+      "learning_rate": 1.9771599591352254e-05,
+      "loss": 0.9983,
+      "step": 146
+    },
+    {
+      "epoch": 0.4852459016393443,
+      "grad_norm": 0.8326368927955627,
+      "learning_rate": 1.97599357040356e-05,
+      "loss": 0.9614,
+      "step": 148
+    },
+    {
+      "epoch": 0.4918032786885246,
+      "grad_norm": 0.8451815247535706,
+      "learning_rate": 1.974798497735038e-05,
+      "loss": 0.9556,
+      "step": 150
+    },
+    {
+      "epoch": 0.49836065573770494,
+      "grad_norm": 0.8128353357315063,
+      "learning_rate": 1.9741902158673524e-05,
+      "loss": 0.9562,
+      "step": 152
+    },
+    {
+      "epoch": 0.5049180327868853,
+      "grad_norm": 0.8914998173713684,
+      "learning_rate": 1.972952183411495e-05,
+      "loss": 0.94,
+      "step": 154
+    },
+    {
+      "epoch": 0.5114754098360655,
+      "grad_norm": 0.853595495223999,
+      "learning_rate": 1.971685556403543e-05,
+      "loss": 0.9464,
+      "step": 156
+    },
+    {
+      "epoch": 0.5180327868852459,
+      "grad_norm": 0.8786745071411133,
+      "learning_rate": 1.9703903720689954e-05,
+      "loss": 0.9173,
+      "step": 158
+    },
+    {
+      "epoch": 0.5245901639344263,
+      "grad_norm": 0.8551218509674072,
+      "learning_rate": 1.9690666684726382e-05,
+      "loss": 0.9355,
+      "step": 160
+    },
+    {
+      "epoch": 0.5311475409836065,
+      "grad_norm": 0.8704633116722107,
+      "learning_rate": 1.9677144845174227e-05,
+      "loss": 0.9343,
+      "step": 162
+    },
+    {
+      "epoch": 0.5377049180327869,
+      "grad_norm": 0.873148500919342,
+      "learning_rate": 1.966333859943323e-05,
+      "loss": 0.925,
+      "step": 164
+    },
+    {
+      "epoch": 0.5442622950819672,
+      "grad_norm": 0.8460001349449158,
+      "learning_rate": 1.9649248353261673e-05,
+      "loss": 0.9286,
+      "step": 166
+    },
+    {
+      "epoch": 0.5508196721311476,
+      "grad_norm": 0.8666718006134033,
+      "learning_rate": 1.9634874520764478e-05,
+      "loss": 0.963,
+      "step": 168
+    },
+    {
+      "epoch": 0.5573770491803278,
+      "grad_norm": 0.937882661819458,
+      "learning_rate": 1.9620217524381007e-05,
+      "loss": 0.9232,
+      "step": 170
+    },
+    {
+      "epoch": 0.5639344262295082,
+      "grad_norm": 0.9387129545211792,
+      "learning_rate": 1.9605277794872656e-05,
+      "loss": 0.9351,
+      "step": 172
+    },
+    {
+      "epoch": 0.5704918032786885,
+      "grad_norm": 0.9671485424041748,
+      "learning_rate": 1.9590055771310212e-05,
+      "loss": 0.9674,
+      "step": 174
+    },
+    {
+      "epoch": 0.5770491803278689,
+      "grad_norm": 0.905163586139679,
+      "learning_rate": 1.9574551901060923e-05,
+      "loss": 0.9305,
+      "step": 176
+    },
+    {
+      "epoch": 0.5836065573770491,
+      "grad_norm": 0.9798932671546936,
+      "learning_rate": 1.955876663977537e-05,
+      "loss": 0.9267,
+      "step": 178
+    },
+    {
+      "epoch": 0.5901639344262295,
+      "grad_norm": 0.8553682565689087,
+      "learning_rate": 1.9542700451374068e-05,
+      "loss": 0.9053,
+      "step": 180
+    },
+    {
+      "epoch": 0.5967213114754099,
+      "grad_norm": 0.9667699337005615,
+      "learning_rate": 1.9526353808033827e-05,
+      "loss": 0.8957,
+      "step": 182
+    },
+    {
+      "epoch": 0.6032786885245902,
+      "grad_norm": 73.45634460449219,
+      "learning_rate": 1.9509727190173883e-05,
+      "loss": 0.9305,
+      "step": 184
+    },
+    {
+      "epoch": 0.6098360655737705,
+      "grad_norm": 1.0331788063049316,
+      "learning_rate": 1.949282108644178e-05,
+      "loss": 0.9105,
+      "step": 186
+    },
+    {
+      "epoch": 0.6163934426229508,
+      "grad_norm": 0.8748126029968262,
+      "learning_rate": 1.9475635993698995e-05,
+      "loss": 0.9232,
+      "step": 188
+    },
+    {
+      "epoch": 0.6229508196721312,
+      "grad_norm": 0.8973132371902466,
+      "learning_rate": 1.9458172417006347e-05,
+      "loss": 0.9148,
+      "step": 190
+    },
+    {
+      "epoch": 0.6295081967213115,
+      "grad_norm": 0.9401828050613403,
+      "learning_rate": 1.9440430869609167e-05,
+      "loss": 0.9679,
+      "step": 192
+    },
+    {
+      "epoch": 0.6360655737704918,
+      "grad_norm": 1.0153672695159912,
+      "learning_rate": 1.9422411872922173e-05,
+      "loss": 0.9044,
+      "step": 194
+    },
+    {
+      "epoch": 0.6426229508196721,
+      "grad_norm": 1.04591965675354,
+      "learning_rate": 1.9404115956514196e-05,
+      "loss": 0.926,
+      "step": 196
+    },
+    {
+      "epoch": 0.6491803278688525,
+      "grad_norm": 0.9370441436767578,
+      "learning_rate": 1.9385543658092572e-05,
+      "loss": 0.9427,
+      "step": 198
+    },
+    {
+      "epoch": 0.6557377049180327,
+      "grad_norm": 0.9781646728515625,
+      "learning_rate": 1.936669552348737e-05,
+      "loss": 0.899,
+      "step": 200
+    },
+    {
+      "epoch": 0.6622950819672131,
+      "grad_norm": 0.9440310001373291,
+      "learning_rate": 1.9347572106635337e-05,
+      "loss": 0.8863,
+      "step": 202
+    },
+    {
+      "epoch": 0.6688524590163935,
+      "grad_norm": 0.9232677817344666,
+      "learning_rate": 1.932817396956362e-05,
+      "loss": 0.8961,
+      "step": 204
+    },
+    {
+      "epoch": 0.6754098360655738,
+      "grad_norm": 0.9292004108428955,
+      "learning_rate": 1.930850168237325e-05,
+      "loss": 0.9111,
+      "step": 206
+    },
+    {
+      "epoch": 0.6819672131147541,
+      "grad_norm": 0.9141654968261719,
+      "learning_rate": 1.928855582322238e-05,
+      "loss": 0.8943,
+      "step": 208
+    },
+    {
+      "epoch": 0.6885245901639344,
+      "grad_norm": 0.9214385747909546,
+      "learning_rate": 1.9268336978309303e-05,
+      "loss": 0.8904,
+      "step": 210
+    },
+    {
+      "epoch": 0.6950819672131148,
+      "grad_norm": 1.0522618293762207,
+      "learning_rate": 1.924784574185522e-05,
+      "loss": 0.9185,
+      "step": 212
+    },
+    {
+      "epoch": 0.7016393442622951,
+      "grad_norm": 1.0826475620269775,
+      "learning_rate": 1.9227082716086778e-05,
+      "loss": 0.9006,
+      "step": 214
+    },
+    {
+      "epoch": 0.7081967213114754,
+      "grad_norm": 0.9551975727081299,
+      "learning_rate": 1.920604851121836e-05,
+      "loss": 0.8768,
+      "step": 216
+    },
+    {
+      "epoch": 0.7147540983606557,
+      "grad_norm": 0.9394155144691467,
+      "learning_rate": 1.918474374543417e-05,
+      "loss": 0.8964,
+      "step": 218
+    },
+    {
+      "epoch": 0.7213114754098361,
+      "grad_norm": 0.9591704607009888,
+      "learning_rate": 1.916316904487005e-05,
+      "loss": 0.9061,
+      "step": 220
+    },
+    {
+      "epoch": 0.7278688524590164,
+      "grad_norm": 0.9659947752952576,
+      "learning_rate": 1.914132504359508e-05,
+      "loss": 0.893,
+      "step": 222
+    },
+    {
+      "epoch": 0.7344262295081967,
+      "grad_norm": 1.2988885641098022,
+      "learning_rate": 1.9119212383592956e-05,
+      "loss": 0.893,
+      "step": 224
+    },
+    {
+      "epoch": 0.740983606557377,
+      "grad_norm": 1.013238549232483,
+      "learning_rate": 1.90968317147431e-05,
+      "loss": 0.8958,
+      "step": 226
+    },
+    {
+      "epoch": 0.7475409836065574,
+      "grad_norm": 0.9777980446815491,
+      "learning_rate": 1.9074183694801582e-05,
+      "loss": 0.904,
+      "step": 228
+    },
+    {
+      "epoch": 0.7540983606557377,
+      "grad_norm": 0.9642871618270874,
+      "learning_rate": 1.905126898938177e-05,
+      "loss": 0.8897,
+      "step": 230
+    },
+    {
+      "epoch": 0.760655737704918,
+      "grad_norm": 1.0655618906021118,
+      "learning_rate": 1.9028088271934797e-05,
+      "loss": 0.9,
+      "step": 232
+    },
+    {
+      "epoch": 0.7672131147540984,
+      "grad_norm": 1.1008930206298828,
+      "learning_rate": 1.900464222372973e-05,
+      "loss": 0.879,
+      "step": 234
+    },
+    {
+      "epoch": 0.7737704918032787,
+      "grad_norm": 0.9866892695426941,
+      "learning_rate": 1.8980931533833568e-05,
+      "loss": 0.8746,
+      "step": 236
+    },
+    {
+      "epoch": 0.780327868852459,
+      "grad_norm": 1.0730581283569336,
+      "learning_rate": 1.8956956899091004e-05,
+      "loss": 0.8452,
+      "step": 238
+    },
+    {
+      "epoch": 0.7868852459016393,
+      "grad_norm": 1.008711338043213,
+      "learning_rate": 1.893271902410392e-05,
+      "loss": 0.8818,
+      "step": 240
+    },
+    {
+      "epoch": 0.7934426229508197,
+      "grad_norm": 1.0424623489379883,
+      "learning_rate": 1.890821862121069e-05,
+      "loss": 0.906,
+      "step": 242
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.0712602138519287,
+      "learning_rate": 1.888345641046525e-05,
+      "loss": 0.8828,
+      "step": 244
+    },
+    {
+      "epoch": 0.8065573770491803,
+      "grad_norm": 1.0236144065856934,
+      "learning_rate": 1.8858433119615932e-05,
+      "loss": 0.8715,
+      "step": 246
+    },
+    {
+      "epoch": 0.8131147540983606,
+      "grad_norm": 1.0133366584777832,
+      "learning_rate": 1.8833149484084064e-05,
+      "loss": 0.8538,
+      "step": 248
+    },
+    {
+      "epoch": 0.819672131147541,
+      "grad_norm": 1.029521107673645,
+      "learning_rate": 1.8807606246942382e-05,
+      "loss": 0.8853,
+      "step": 250
+    },
+    {
+      "epoch": 0.8262295081967214,
+      "grad_norm": 1.0954114198684692,
+      "learning_rate": 1.878180415889316e-05,
+      "loss": 0.8846,
+      "step": 252
+    },
+    {
+      "epoch": 0.8327868852459016,
+      "grad_norm": 1.0050263404846191,
+      "learning_rate": 1.8755743978246182e-05,
+      "loss": 0.881,
+      "step": 254
+    },
+    {
+      "epoch": 0.839344262295082,
+      "grad_norm": 1.0372755527496338,
+      "learning_rate": 1.8729426470896422e-05,
+      "loss": 0.8597,
+      "step": 256
+    },
+    {
+      "epoch": 0.8459016393442623,
+      "grad_norm": 1.0824848413467407,
+      "learning_rate": 1.8702852410301556e-05,
+      "loss": 0.8734,
+      "step": 258
+    },
+    {
+      "epoch": 0.8524590163934426,
+      "grad_norm": 0.9826651811599731,
+      "learning_rate": 1.8676022577459225e-05,
+      "loss": 0.8722,
+      "step": 260
+    },
+    {
+      "epoch": 0.8590163934426229,
+      "grad_norm": 1.0271152257919312,
+      "learning_rate": 1.8648937760884084e-05,
+      "loss": 0.8694,
+      "step": 262
+    },
+    {
+      "epoch": 0.8655737704918033,
+      "grad_norm": 1.0647377967834473,
+      "learning_rate": 1.8621598756584624e-05,
+      "loss": 0.8841,
+      "step": 264
+    },
+    {
+      "epoch": 0.8721311475409836,
+      "grad_norm": 1.0451794862747192,
+      "learning_rate": 1.8594006368039778e-05,
+      "loss": 0.8884,
+      "step": 266
+    },
+    {
+      "epoch": 0.8786885245901639,
+      "grad_norm": 1.1161409616470337,
+      "learning_rate": 1.8566161406175306e-05,
+      "loss": 0.8492,
+      "step": 268
+    },
+    {
+      "epoch": 0.8852459016393442,
+      "grad_norm": 1.2318251132965088,
+      "learning_rate": 1.8538064689339972e-05,
+      "loss": 0.8767,
+      "step": 270
+    },
+    {
+      "epoch": 0.8918032786885246,
+      "grad_norm": 1.0418727397918701,
+      "learning_rate": 1.850971704328148e-05,
+      "loss": 0.8516,
+      "step": 272
+    },
+    {
+      "epoch": 0.898360655737705,
+      "grad_norm": 1.060301661491394,
+      "learning_rate": 1.848111930112221e-05,
+      "loss": 0.8703,
+      "step": 274
+    },
+    {
+      "epoch": 0.9049180327868852,
+      "grad_norm": 1.0346226692199707,
+      "learning_rate": 1.8452272303334743e-05,
+      "loss": 0.8518,
+      "step": 276
+    },
+    {
+      "epoch": 0.9114754098360656,
+      "grad_norm": 1.037509560585022,
+      "learning_rate": 1.8423176897717143e-05,
+      "loss": 0.8393,
+      "step": 278
+    },
+    {
+      "epoch": 0.9180327868852459,
+      "grad_norm": 1.1179131269454956,
+      "learning_rate": 1.8393833939368057e-05,
+      "loss": 0.8516,
+      "step": 280
+    },
+    {
+      "epoch": 0.9245901639344263,
+      "grad_norm": 1.1018941402435303,
+      "learning_rate": 1.8364244290661568e-05,
+      "loss": 0.8538,
+      "step": 282
+    },
+    {
+      "epoch": 0.9311475409836065,
+      "grad_norm": 1.0070247650146484,
+      "learning_rate": 1.8334408821221865e-05,
+      "loss": 0.8526,
+      "step": 284
+    },
+    {
+      "epoch": 0.9377049180327869,
+      "grad_norm": 1.096187710762024,
+      "learning_rate": 1.8304328407897678e-05,
+      "loss": 0.8784,
+      "step": 286
+    },
+    {
+      "epoch": 0.9442622950819672,
+      "grad_norm": 1.038619041442871,
+      "learning_rate": 1.8274003934736507e-05,
+      "loss": 0.8656,
+      "step": 288
+    },
+    {
+      "epoch": 0.9508196721311475,
+      "grad_norm": 1.0244433879852295,
+      "learning_rate": 1.8243436292958638e-05,
+      "loss": 0.864,
+      "step": 290
+    },
+    {
+      "epoch": 0.9573770491803278,
+      "grad_norm": 1.0341455936431885,
+      "learning_rate": 1.8228061564299654e-05,
+      "loss": 0.8546,
+      "step": 292
+    },
+    {
+      "epoch": 0.9639344262295082,
+      "grad_norm": 1.0983253717422485,
+      "learning_rate": 1.819713085626076e-05,
+      "loss": 0.8448,
+      "step": 294
+    },
+    {
+      "epoch": 0.9704918032786886,
+      "grad_norm": 1.0788861513137817,
+      "learning_rate": 1.8165959238864446e-05,
+      "loss": 0.8276,
+      "step": 296
+    },
+    {
+      "epoch": 0.9770491803278688,
+      "grad_norm": 1.0404618978500366,
+      "learning_rate": 1.813454762822813e-05,
+      "loss": 0.8398,
+      "step": 298
+    },
+    {
+      "epoch": 0.9836065573770492,
+      "grad_norm": 1.0631470680236816,
+      "learning_rate": 1.81028969475225e-05,
+      "loss": 0.8364,
+      "step": 300
+    },
+    {
+      "epoch": 0.9901639344262295,
+      "grad_norm": 1.0844906568527222,
+      "learning_rate": 1.8071008126944386e-05,
+      "loss": 0.8817,
+      "step": 302
+    },
+    {
+      "epoch": 0.9967213114754099,
+      "grad_norm": 1.107038974761963,
+      "learning_rate": 1.8038882103689425e-05,
+      "loss": 0.8274,
+      "step": 304
+    },
+    {
+      "epoch": 1.0032786885245901,
+      "grad_norm": 1.061295747756958,
+      "learning_rate": 1.800651982192452e-05,
+      "loss": 0.8206,
+      "step": 306
+    },
+    {
+      "epoch": 1.0098360655737706,
+      "grad_norm": 1.1456193923950195,
+      "learning_rate": 1.7973922232760074e-05,
+      "loss": 0.8625,
+      "step": 308
+    },
+    {
+      "epoch": 1.0163934426229508,
+      "grad_norm": 1.1260658502578735,
+      "learning_rate": 1.7941090294222067e-05,
+      "loss": 0.806,
+      "step": 310
+    },
+    {
+      "epoch": 1.022950819672131,
+      "grad_norm": 1.068522334098816,
+      "learning_rate": 1.7908024971223875e-05,
+      "loss": 0.8067,
+      "step": 312
+    },
+    {
+      "epoch": 1.0295081967213116,
+      "grad_norm": 1.1286215782165527,
+      "learning_rate": 1.787472723553792e-05,
+      "loss": 0.8588,
+      "step": 314
+    },
+    {
+      "epoch": 1.0360655737704918,
+      "grad_norm": 1.152809977531433,
+      "learning_rate": 1.7841198065767107e-05,
+      "loss": 0.8243,
+      "step": 316
+    },
+    {
+      "epoch": 1.042622950819672,
+      "grad_norm": 1.128699541091919,
+      "learning_rate": 1.7807438447316076e-05,
+      "loss": 0.8271,
+      "step": 318
+    },
+    {
+      "epoch": 1.0491803278688525,
+      "grad_norm": 1.0822534561157227,
+      "learning_rate": 1.7773449372362232e-05,
+      "loss": 0.8553,
+      "step": 320
+    },
+    {
+      "epoch": 1.0557377049180328,
+      "grad_norm": 1.0860483646392822,
+      "learning_rate": 1.7739231839826573e-05,
+      "loss": 0.823,
+      "step": 322
+    },
+    {
+      "epoch": 1.0622950819672132,
+      "grad_norm": 1.0862377882003784,
+      "learning_rate": 1.7704786855344362e-05,
+      "loss": 0.8181,
+      "step": 324
+    },
+    {
+      "epoch": 1.0688524590163935,
+      "grad_norm": 1.1501132249832153,
+      "learning_rate": 1.767011543123554e-05,
+      "loss": 0.8183,
+      "step": 326
+    },
+    {
+      "epoch": 1.0754098360655737,
+      "grad_norm": 1.1655324697494507,
+      "learning_rate": 1.7635218586474997e-05,
+      "loss": 0.8252,
+      "step": 328
+    },
+    {
+      "epoch": 1.0819672131147542,
+      "grad_norm": 1.1589093208312988,
+      "learning_rate": 1.7600097346662623e-05,
+      "loss": 0.8075,
+      "step": 330
+    },
+    {
+      "epoch": 1.0885245901639344,
+      "grad_norm": 1.2046269178390503,
+      "learning_rate": 1.7564752743993145e-05,
+      "loss": 0.8412,
+      "step": 332
+    },
+    {
+      "epoch": 1.0950819672131147,
+      "grad_norm": 1.1746525764465332,
+      "learning_rate": 1.7529185817225814e-05,
+      "loss": 0.8393,
+      "step": 334
+    },
+    {
+      "epoch": 1.1016393442622952,
+      "grad_norm": 1.167726755142212,
+      "learning_rate": 1.7493397611653878e-05,
+      "loss": 0.8107,
+      "step": 336
+    },
+    {
+      "epoch": 1.1081967213114754,
+      "grad_norm": 1.1783404350280762,
+      "learning_rate": 1.745738917907384e-05,
+      "loss": 0.86,
+      "step": 338
+    },
+    {
+      "epoch": 1.1147540983606556,
+      "grad_norm": 1.1371574401855469,
+      "learning_rate": 1.7421161577754565e-05,
+      "loss": 0.7735,
+      "step": 340
+    },
+    {
+      "epoch": 1.1213114754098361,
+      "grad_norm": 33.01966094970703,
+      "learning_rate": 1.738471587240617e-05,
+      "loss": 0.8054,
+      "step": 342
+    },
+    {
+      "epoch": 1.1278688524590164,
+      "grad_norm": 1.2561206817626953,
+      "learning_rate": 1.7348053134148727e-05,
+      "loss": 0.8005,
+      "step": 344
+    },
+    {
+      "epoch": 1.1344262295081968,
+      "grad_norm": 1.304589033126831,
+      "learning_rate": 1.7329640714149125e-05,
+      "loss": 0.8116,
+      "step": 346
+    },
+    {
+      "epoch": 1.140983606557377,
+      "grad_norm": 1.1576813459396362,
+      "learning_rate": 1.729265444882255e-05,
+      "loss": 0.8001,
+      "step": 348
+    },
+    {
+      "epoch": 1.1475409836065573,
+      "grad_norm": 1.1452754735946655,
+      "learning_rate": 1.7255453856223674e-05,
+      "loss": 0.8338,
+      "step": 350
+    },
+    {
+      "epoch": 1.1540983606557378,
+      "grad_norm": 1.1715235710144043,
+      "learning_rate": 1.7218040029658316e-05,
+      "loss": 0.8203,
+      "step": 352
+    },
+    {
+      "epoch": 1.160655737704918,
+      "grad_norm": 1.183443546295166,
+      "learning_rate": 1.7180414068699126e-05,
+      "loss": 0.7913,
+      "step": 354
+    },
+    {
+      "epoch": 1.1672131147540983,
+      "grad_norm": 1.2050565481185913,
+      "learning_rate": 1.7142577079153272e-05,
+      "loss": 0.8081,
+      "step": 356
+    },
+    {
+      "epoch": 1.1737704918032787,
+      "grad_norm": 1.1526808738708496,
+      "learning_rate": 1.710453017302995e-05,
+      "loss": 0.777,
+      "step": 358
+    },
+    {
+      "epoch": 1.180327868852459,
+      "grad_norm": 1.171370267868042,
+      "learning_rate": 1.7066274468507677e-05,
+      "loss": 0.8122,
+      "step": 360
+    },
+    {
+      "epoch": 1.1868852459016392,
+      "grad_norm": 1.1098692417144775,
+      "learning_rate": 1.7027811089901465e-05,
+      "loss": 0.8231,
+      "step": 362
+    },
+    {
+      "epoch": 1.1934426229508197,
+      "grad_norm": 1.1354933977127075,
+      "learning_rate": 1.6989141167629743e-05,
+      "loss": 0.8029,
+      "step": 364
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 1.1700104475021362,
+      "learning_rate": 1.695026583818115e-05,
+      "loss": 0.7772,
+      "step": 366
+    },
+    {
+      "epoch": 1.2065573770491804,
+      "grad_norm": 1.162822961807251,
+      "learning_rate": 1.691118624408115e-05,
+      "loss": 0.8152,
+      "step": 368
+    },
+    {
+      "epoch": 1.2131147540983607,
+      "grad_norm": 11.616262435913086,
+      "learning_rate": 1.6871903533858417e-05,
+      "loss": 0.8046,
+      "step": 370
+    },
+    {
+      "epoch": 1.219672131147541,
+      "grad_norm": 1.1527999639511108,
+      "learning_rate": 1.683241886201111e-05,
+      "loss": 0.7901,
+      "step": 372
+    },
+    {
+      "epoch": 1.2262295081967214,
+      "grad_norm": 1.5522733926773071,
+      "learning_rate": 1.679273338897293e-05,
+      "loss": 0.813,
+      "step": 374
+    },
+    {
+      "epoch": 1.2327868852459016,
+      "grad_norm": 1.2122429609298706,
+      "learning_rate": 1.675284828107903e-05,
+      "loss": 0.8178,
+      "step": 376
+    },
+    {
+      "epoch": 1.2393442622950819,
+      "grad_norm": 1.209091305732727,
+      "learning_rate": 1.6712764710531716e-05,
+      "loss": 0.8095,
+      "step": 378
+    },
+    {
+      "epoch": 1.2459016393442623,
+      "grad_norm": 1.2088159322738647,
+      "learning_rate": 1.6672483855366003e-05,
+      "loss": 0.7983,
+      "step": 380
+    },
+    {
+      "epoch": 1.2524590163934426,
+      "grad_norm": 1.1982166767120361,
+      "learning_rate": 1.6632006899415016e-05,
+      "loss": 0.809,
+      "step": 382
+    },
+    {
+      "epoch": 1.2590163934426228,
+      "grad_norm": 1.2566733360290527,
+      "learning_rate": 1.659133503227515e-05,
+      "loss": 0.7993,
+      "step": 384
+    },
+    {
+      "epoch": 1.2655737704918033,
+      "grad_norm": 1.1701586246490479,
+      "learning_rate": 1.6550469449271166e-05,
+      "loss": 0.7905,
+      "step": 386
+    },
+    {
+      "epoch": 1.2721311475409836,
+      "grad_norm": 1.1296133995056152,
+      "learning_rate": 1.6509411351421015e-05,
+      "loss": 0.8283,
+      "step": 388
+    },
+    {
+      "epoch": 1.278688524590164,
+      "grad_norm": 1.2119910717010498,
+      "learning_rate": 1.6468161945400563e-05,
+      "loss": 0.8043,
+      "step": 390
+    },
+    {
+      "epoch": 1.2852459016393443,
+      "grad_norm": 1.211780071258545,
+      "learning_rate": 1.6426722443508125e-05,
+      "loss": 0.8048,
+      "step": 392
+    },
+    {
+      "epoch": 1.2918032786885245,
+      "grad_norm": 1.223976731300354,
+      "learning_rate": 1.6385094063628824e-05,
+      "loss": 0.7913,
+      "step": 394
+    },
+    {
+      "epoch": 1.298360655737705,
+      "grad_norm": 1.1694824695587158,
+      "learning_rate": 1.6343278029198814e-05,
+      "loss": 0.7963,
+      "step": 396
+    },
+    {
+      "epoch": 1.3049180327868852,
+      "grad_norm": 1.1707338094711304,
+      "learning_rate": 1.6301275569169323e-05,
+      "loss": 0.7838,
+      "step": 398
+    },
+    {
+      "epoch": 1.3114754098360657,
+      "grad_norm": 1.1705915927886963,
+      "learning_rate": 1.625908791797052e-05,
+      "loss": 0.7911,
+      "step": 400
+    }
+  ],
+  "logging_steps": 2,
+  "max_steps": 1220,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.766063076789248e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-400/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44207fb5ae35d66a04efe3ac870311d55bf06ee7dec676dcb7a600c416961632
+size 5777

checkpoint-400/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff