distillabs committed
Commit 2bd2a03 · verified · 1 Parent(s): 4dc0160

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,31 @@
+ GENERAL TERMS AND CONDITIONS
+
+ Note that if you want to use the Commercial license, please contact us at contact@distillabs.ai
+
+ - Model License Terms -
+
+ R&D License
+
+ 1. SERVICES, PRICES AND PAYMENT
+
+ 1.1 The Customer pays a one-time license fee, as indicated in the check-out process, for the running of one (1) training process of the selected Base Model using Customer Data (“License Fee”).
+
+ 1.2 The License Fee shall be due for payment in advance. The Customer shall only be permitted to set off against payment claims of Distil Labs if the Customer’s claims are undisputed or have become res judicata.
+
+ 2. MODEL LICENSE: R&D LICENSE
+
+ 2.1 Subject to the Customer’s payment of the license fee, Distil Labs grants to the Customer the Model License (as defined below). For clarification, Distil Labs retains any other rights in its software or know-how, in particular in the codebase needed for the fine-tuning of the Trained Model.
+
+ 2.2 Subject to the requirements of the Base Model License (cf. Section 2.5 below), Distil Labs transfers to the Customer the perpetual, non-exclusive usage right to the Trained Model for non-commercial purposes of prototyping and research & development. The Parties agree that commercial purposes include deployment in production externally (to be used by the Customer’s customers, paid or free of charge) or internally (as a tool for the Customer’s employees). The territorial scope of the license is limited to use within the United States of America and the European Economic Area, including all member states of the European Union (“Model License”).
+
+ 2.3 The Model License for non-commercial purposes of prototyping and research & development shall include (i) the non-exclusive right to permanent or temporary reproduction, in whole or in part, by any means and in any form (e.g. permanent and/or volatile storage on electrical, electromagnetic, optical storage media, such as any type of SDD, HDD, DVD, memory cards, USB sticks), (ii) the non-exclusive right to distribution in any form, media and by any means, regardless of whether the distribution is in tangible or intangible form, in particular to transmit the Trained Model via wired and wireless networks (e.g. for download from internet or intranet by wire or wireless means including broadband, cable, fiberglass, WIFI, LTE, 5G, satellite internet, other data networks), and (iii) the non-exclusive right of making available to the public in such a way that members of the public can access it from places and at times of their choice (e.g. by web or mobile app, virtual or augmented reality, cloud storage, cloud hosting, decentralized hosting, non-fungible token, application service providing, software as a service, or cloud computing). The license shall also contain, to the extent necessary for prototyping and research & development, the right to adapt and modify the Trained Model subject to the limitations in Sections 2.4 and 2.5 below, to further develop the Trained Model including changes to functions or appearance, to adapt it to other software versions, to exchange parts of the Trained Model or combine the Trained Model with other results of work, and to use the results in the same way as the original Trained Model. Any models derived from the Trained Model shall retain this model license.
+
+ 2.4 The Customer shall not, without the prior written consent of Distil Labs:
+
+ 2.4.1 train, fine-tune, re-train, or otherwise modify the Trained Model, except for purposes of research & development;
+
+ 2.4.2 use the Trained Model or any part thereof to create derivative models or services that compete with those of Distil Labs;
+
+ 2.4.3 circumvent any technical restrictions embedded in the Trained Model or Base Model that are designed to enforce usage limitations.
+
+ 2.5 The Parties acknowledge and agree that the Trained Model is developed from Base Models which are supplied by a third party. Therefore, the Model License is subject to the restrictions resulting from the open-source or any other applicable license of the Base Model (“Base Model License”), and the Customer must use the Trained Model in compliance with the Base Model License. In particular, the Customer must oblige their clients to comply with the Base Model License in any case of transferring or sublicensing the rights to, or making available in any way, the Trained Model. The applicable Base Model License is defined in the Training Configuration and will be provided for download. The Customer agrees to indemnify Distil Labs for any and all claims brought by the Base Model provider for violations of the Base Model License.
Modelfile ADDED
@@ -0,0 +1,51 @@
+
+ FROM ./model.gguf
+
+ TEMPLATE """{{- $lastUserIdx := -1 -}}
+ {{- range $idx, $msg := .Messages -}}
+ {{- if eq $msg.Role "user" }}{{ $lastUserIdx = $idx }}{{ end -}}
+ {{- end }}
+ {{- if or .System .Tools }}<|im_start|>system
+ {{ if .System }}{{ .System }}
+
+ {{ end }}
+ {{- if .Tools }}# Tools
+
+ You may call one or more functions to assist with the user query.
+
+ You are provided with function signatures within <tools></tools> XML tags:
+ <tools>
+ {{- range .Tools }}
+ {"type": "function", "function": {{ .Function }}}
+ {{- end }}
+ </tools>
+
+ For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
+ <tool_call>
+ {"name": <function-name>, "arguments": <args-json-object>}
+ </tool_call>
+ {{- end -}}
+ <|im_end|>
+ {{ end }}
+ {{- range $i, $_ := .Messages }}
+ {{- $last := eq (len (slice $.Messages $i)) 1 -}}
+ {{- if eq .Role "user" }}<|im_start|>user
+ {{ .Content }}<|im_end|>
+ {{ else if eq .Role "assistant" }}<|im_start|>assistant
+ {{ if .Content }}{{ .Content }}{{ end }}
+ {{- if .ToolCalls }}
+ {{- range .ToolCalls }}
+ <tool_call>
+ {"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
+ </tool_call>
+ {{- end }}
+ {{- end }}{{ if not $last }}<|im_end|>
+ {{ end }}
+ {{- else if eq .Role "tool" }}<|im_start|>user
+ <tool_response>
+ {{ .Content }}
+ </tool_response><|im_end|>
+ {{ end }}
+ {{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
+ {{ end }}
+ {{- end }}"""
README.md ADDED
@@ -0,0 +1,194 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ base_model: Qwen/Qwen3-4B
+ tags:
+ - text2sql
+ - sql
+ - nlp
+ - distillation
+ - qwen3
+ datasets:
+ - distil-labs/text2sql-synthetic
+ language:
+ - en
+ pipeline_tag: text-generation
+ ---
+
+ # Distil-Qwen3-4B-Text2SQL
+
+ A fine-tuned Qwen3-4B model for converting natural language questions into SQL queries. Trained via knowledge distillation from DeepSeek-V3, this 4B-parameter model matches teacher-level accuracy while being small enough to run locally.
+
+ ## Results
+
+ | Metric | DeepSeek-V3 (Teacher) | Qwen3-4B (Base) | **This Model** |
+ |--------|:---------------------:|:---------------:|:--------------:|
+ | LLM-as-a-Judge | 80% | 62% | **80%** |
+ | Exact Match | 48% | 16% | **60%** |
+ | ROUGE | 87.6% | 84.2% | **89.5%** |
+ | METEOR | 85.1% | 87.3% | 86.1% |
+
+ The fine-tuned model **matches the 685B-parameter teacher** on LLM-as-a-Judge accuracy and **exceeds it** on exact match and ROUGE.
+
+ ## Quick Start
+
+ ### Using Transformers
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model = AutoModelForCausalLM.from_pretrained("distil-labs/distil-qwen3-4b-text2sql")
+ tokenizer = AutoTokenizer.from_pretrained("distil-labs/distil-qwen3-4b-text2sql")
+
+ schema = """CREATE TABLE employees (
+     id INTEGER PRIMARY KEY,
+     name TEXT NOT NULL,
+     department TEXT,
+     salary INTEGER
+ );"""
+
+ question = "How many employees earn more than 50000?"
+
+ messages = [
+     {
+         "role": "system",
+         "content": """You are a problem solving model working on task_description XML block:
+ <task_description>You are given a database schema and a natural language question. Generate the SQL query that answers the question.
+
+ Input:
+ - Schema: One or two table definitions in SQL DDL format
+ - Question: Natural language question about the data
+
+ Output:
+ - A single SQL query that answers the question
+ - No explanations, comments, or additional text
+
+ Rules:
+ - Use only tables and columns from the provided schema
+ - Use uppercase SQL keywords (SELECT, FROM, WHERE, etc.)
+ - Use SQLite-compatible syntax</task_description>
+ You will be given a single task in the question XML block
+ Solve only the task in question block.
+ Generate only the answer, do not generate anything else"""
+     },
+     {
+         "role": "user",
+         "content": f"""Now for the real task, solve the task in question block.
+ Generate only the solution, do not generate anything else
+ <question>Schema:
+ {schema}
+
+ Question: {question}</question>"""
+     }
+ ]
+
+ text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ inputs = tokenizer(text, return_tensors="pt")
+ outputs = model.generate(**inputs, max_new_tokens=256, do_sample=False)  # greedy decoding for deterministic SQL
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```
+
+ ### Using Ollama (GGUF version)
+
+ For local inference, use the quantized GGUF versions:
+ - [distil-qwen3-4b-text2sql-gguf](https://huggingface.co/distil-labs/distil-qwen3-4b-text2sql-gguf) - Full-precision GGUF
+ - [distil-qwen3-4b-text2sql-gguf-4bit](https://huggingface.co/distil-labs/distil-qwen3-4b-text2sql-gguf-4bit) - 4-bit quantized (~2.5GB)
+
+ ```bash
+ # Download and create the Ollama model
+ ollama create distil-qwen3-4b-text2sql -f Modelfile
+
+ # Run inference
+ ollama run distil-qwen3-4b-text2sql
+ ```
+
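+ For programmatic use, Ollama exposes a local HTTP API. A minimal sketch, assuming the server runs on the default port 11434 and the model was created under the name above:
+
+ ```python
+ import json
+ import urllib.request
+
+ # Query the local Ollama server; /api/chat returns one JSON object
+ # (with the reply under "message") when "stream" is false.
+ payload = {
+     "model": "distil-qwen3-4b-text2sql",
+     "messages": [{
+         "role": "user",
+         "content": "Schema:\nCREATE TABLE employees (id INTEGER PRIMARY KEY, salary INTEGER);\n\nQuestion: How many employees earn more than 50000?",
+     }],
+     "stream": False,
+ }
+ req = urllib.request.Request(
+     "http://localhost:11434/api/chat",
+     data=json.dumps(payload).encode("utf-8"),
+     headers={"Content-Type": "application/json"},
+ )
+ with urllib.request.urlopen(req) as resp:
+     print(json.loads(resp.read())["message"]["content"])
+ ```
+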
+ ## Model Details
+
+ | Property | Value |
+ |----------|-------|
+ | Base Model | [Qwen/Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B) |
+ | Parameters | 4 billion |
+ | Architecture | Qwen3ForCausalLM |
+ | Context Length | 262,144 tokens |
+ | Precision | bfloat16 |
+ | Training Data | ~10,000 synthetic examples |
+ | Teacher Model | DeepSeek-V3 |
+
+ ## Training
+
+ This model was trained using the [Distil Labs](https://distillabs.ai) platform:
+
+ 1. **Seed Data**: 50 hand-validated Text2SQL examples covering various SQL complexities
+ 2. **Synthetic Generation**: Expanded to ~10,000 examples using DeepSeek-V3
+ 3. **Fine-tuning**: 4 epochs on the synthetic dataset
+ 4. **Evaluation**: LLM-as-a-Judge with semantic equivalence checking
+
+ ### Training Hyperparameters
+
+ - Epochs: 4
+ - Learning Rate: 5e-5 (cosine schedule)
+ - Batch Size: 1 (with gradient accumulation)
+ - Total Steps: ~40,000
+
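+ For orientation, a minimal supervised fine-tuning sketch using these hyperparameters. This is an illustration only: the actual Distil Labs training pipeline is not public, and the dataset field name (`text`) and the gradient-accumulation factor are assumptions.
+
+ ```python
+ from datasets import load_dataset
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                           DataCollatorForLanguageModeling, Trainer,
+                           TrainingArguments)
+
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
+ model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B")
+
+ def tokenize(example):
+     # Assumes each row carries a pre-rendered chat-format "text" field.
+     return tokenizer(example["text"], truncation=True, max_length=2048)
+
+ dataset = load_dataset("distil-labs/text2sql-synthetic", split="train")
+ dataset = dataset.map(tokenize, remove_columns=dataset.column_names)
+
+ args = TrainingArguments(
+     output_dir="distil-qwen3-4b-text2sql",
+     num_train_epochs=4,                 # 4 epochs
+     learning_rate=5e-5,                 # 5e-5 ...
+     lr_scheduler_type="cosine",         # ... with cosine schedule
+     per_device_train_batch_size=1,      # batch size 1 ...
+     gradient_accumulation_steps=8,      # ... with accumulation (factor assumed)
+     bf16=True,
+ )
+ # mlm=False gives causal-LM labels (input_ids shifted, padding masked).
+ collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
+ Trainer(model=model, args=args, train_dataset=dataset,
+         data_collator=collator).train()
+ ```
+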
+ ## Task Format
+
+ ### Input Format
+
+ ```
+ Schema:
+ CREATE TABLE table_name (
+     column_name DATA_TYPE [CONSTRAINTS],
+     ...
+ );
+
+ Question: Natural language question about the data
+ ```
+
+ ### Output Format
+
+ A single SQL query with:
+ - Uppercase SQL keywords (SELECT, FROM, WHERE, etc.)
+ - SQLite-compatible syntax
+ - No explanations or additional text
+
+ ### Supported SQL Features
+
+ - **Simple**: SELECT, WHERE, COUNT, SUM, AVG, MAX, MIN
+ - **Medium**: JOIN, GROUP BY, HAVING, ORDER BY, LIMIT
+ - **Complex**: Subqueries, multiple JOINs, UNION
+
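+ A worked example in this format (illustrative; the query shown is one correct answer under the rules above):
+
+ ```
+ Schema:
+ CREATE TABLE orders (
+     id INTEGER PRIMARY KEY,
+     customer TEXT,
+     total INTEGER
+ );
+
+ Question: What is the average order total per customer?
+ ```
+
+ Expected output:
+
+ ```sql
+ SELECT customer, AVG(total) FROM orders GROUP BY customer;
+ ```
+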
+ ## Use Cases
+
+ - Natural language interfaces to databases
+ - SQL query assistance and autocompletion
+ - Database chatbots and conversational BI
+ - Educational tools for learning SQL
+
+ ## Limitations
+
+ - Optimized for SQLite syntax
+ - Best with 1-2 table schemas
+ - May struggle with highly complex nested subqueries
+ - Trained on English questions only
+
+ ## License
+
+ This model is released under the Apache 2.0 license.
+
+ ## Links
+
+ - [Distil Labs Website](https://distillabs.ai)
+ - [GitHub](https://github.com/distil-labs)
+ - [Hugging Face](https://huggingface.co/distil-labs)
+
+ ## Citation
+
+ ```bibtex
+ @misc{distil-qwen3-4b-text2sql,
+   author = {Distil Labs},
+   title = {Distil-Qwen3-4B-Text2SQL: A Fine-tuned Model for Natural Language to SQL},
+   year = {2025},
+   publisher = {Hugging Face},
+   url = {https://huggingface.co/distil-labs/distil-qwen3-4b-text2sql}
+ }
+ ```
STUDENT_LICENSE ADDED
@@ -0,0 +1,13 @@
+ Copyright 2023 Qwen
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
TEACHER_LICENSE ADDED
@@ -0,0 +1,9 @@
+ MIT License
+
+ Copyright (c) 2023 DeepSeek
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
added_tokens.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "</think>": 151668,
+   "</tool_call>": 151658,
+   "</tool_response>": 151666,
+   "<think>": 151667,
+   "<tool_call>": 151657,
+   "<tool_response>": 151665,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
chat_template.jinja ADDED
@@ -0,0 +1,61 @@
+ {%- if tools %}
+     {{- '<|im_start|>system\n' }}
+     {%- if messages[0].role == 'system' %}
+         {{- messages[0].content + '\n\n' }}
+     {%- endif %}
+     {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+     {%- for tool in tools %}
+         {{- "\n" }}
+         {{- tool | tojson }}
+     {%- endfor %}
+     {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+     {%- if messages[0].role == 'system' %}
+         {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+     {%- endif %}
+ {%- endif %}
+ {%- for message in messages %}
+     {%- if message.content is string %}
+         {%- set content = message.content %}
+     {%- else %}
+         {%- set content = '' %}
+     {%- endif %}
+     {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+         {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+     {%- elif message.role == "assistant" %}
+         {{- '<|im_start|>' + message.role + '\n' + content }}
+         {%- if message.tool_calls %}
+             {%- for tool_call in message.tool_calls %}
+                 {%- if (loop.first and content) or (not loop.first) %}
+                     {{- '\n' }}
+                 {%- endif %}
+                 {%- if tool_call.function %}
+                     {%- set tool_call = tool_call.function %}
+                 {%- endif %}
+                 {{- '<tool_call>\n{"name": "' }}
+                 {{- tool_call.name }}
+                 {{- '", "arguments": ' }}
+                 {%- if tool_call.arguments is string %}
+                     {{- tool_call.arguments }}
+                 {%- else %}
+                     {{- tool_call.arguments | tojson }}
+                 {%- endif %}
+                 {{- '}\n</tool_call>' }}
+             {%- endfor %}
+         {%- endif %}
+         {{- '<|im_end|>\n' }}
+     {%- elif message.role == "tool" %}
+         {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+             {{- '<|im_start|>user' }}
+         {%- endif %}
+         {{- '\n<tool_response>\n' }}
+         {{- content }}
+         {{- '\n</tool_response>' }}
+         {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+             {{- '<|im_end|>\n' }}
+         {%- endif %}
+     {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+     {{- '<|im_start|>assistant\n' }}
+ {%- endif %}
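The template above implements Qwen-style tool calling. A quick way to inspect the rendered prompt, as a sketch (the `run_sql` tool schema below is a made-up example, not part of the model):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distil-labs/distil-qwen3-4b-text2sql")

tools = [{
    "type": "function",
    "function": {
        "name": "run_sql",  # hypothetical tool, for illustration only
        "description": "Execute a SQL query against the demo database.",
        "parameters": {
            "type": "object",
            "properties": {"query": {"type": "string"}},
            "required": ["query"],
        },
    },
}]
messages = [{"role": "user", "content": "How many employees are there?"}]

# Renders the <tools> block from the template plus the assistant prompt suffix.
prompt = tokenizer.apply_chat_template(
    messages, tools=tools, tokenize=False, add_generation_prompt=True
)
print(prompt)
```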
config.json ADDED
@@ -0,0 +1,70 @@
+ {
+   "architectures": [
+     "Qwen3ForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 2560,
+   "initializer_range": 0.02,
+   "intermediate_size": 9728,
+   "layer_types": [
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention"
+   ],
+   "max_position_embeddings": 262144,
+   "max_window_layers": 36,
+   "model_type": "qwen3",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 36,
+   "num_key_value_heads": 8,
+   "pad_token": "<|endoftext|>",
+   "pad_token_id": 151643,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 5000000,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.53.0",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936
+ }
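As a sanity check, the parameter count follows directly from this config (tied embeddings, GQA with 8 KV heads, per-head Q/K norms) and matches the `total_parameters` value in model.safetensors.index.json below. A short derivation in Python:

```python
# Derive the total parameter count from config.json values.
hidden, inter, vocab = 2560, 9728, 151936
layers, heads, kv_heads, head_dim = 36, 32, 8, 128

embed = vocab * hidden                      # embed_tokens, tied with lm_head
attn = hidden * heads * head_dim            # q_proj
attn += 2 * hidden * kv_heads * head_dim    # k_proj, v_proj (GQA)
attn += heads * head_dim * hidden           # o_proj
attn += 2 * head_dim                        # q_norm, k_norm
mlp = 3 * hidden * inter                    # gate_proj, up_proj, down_proj
norms = 2 * hidden                          # input + post-attention layernorm

total = embed + layers * (attn + mlp + norms) + hidden  # + final model.norm
print(total)  # 4022468096, matching the index metadata
```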
generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "temperature": 0.7,
+   "top_k": 20,
+   "top_p": 0.8,
+   "transformers_version": "4.53.0"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ff64f59fedf1ec2cc9184e859cbe9ad5a41700566b6d6b0c5c24bfb18e15603
+ size 4967215360
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f81196c29e6d0913ee225ba6ede73339a4e357074325f6c709602feced13eda
+ size 3077766632
model.safetensors.index.json ADDED
@@ -0,0 +1,406 @@
+ {
+   "metadata": {
+     "total_parameters": 4022468096,
+     "total_size": 8044936192
+   },
+   "weight_map": {
+     "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+     "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+     "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+     "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+     "model.norm.weight": "model-00002-of-00002.safetensors"
+   }
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "bos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
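special_tokens_map.json pins <|im_end|> as the EOS token (the ChatML-style turn terminator) while <|endoftext|> doubles as both BOS and padding, and it re-declares the chat and vision markers as additional special tokens. A quick sanity check once the repo is downloaded, assuming the standard transformers API (the local path is a placeholder):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./model-repo")  # placeholder path
    # Expected from the file above: <|endoftext|> <|im_end|> <|endoftext|>
    print(tok.bos_token, tok.eos_token, tok.pad_token)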
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:60927effd4aeb00104fe4716572e8db4041e32c0f2d175d8ae56816977013845
+ size 11422924
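tokenizer.json is checked in as a Git LFS pointer: the three lines record only the LFS spec version, the blob's sha256, and its byte size (about 11 MB), not the tokenizer data itself. huggingface_hub resolves the pointer to the real blob on download; the repo id below is a placeholder:

    from huggingface_hub import hf_hub_download

    # Placeholder repo id; this fetches the ~11 MB LFS blob, not the 3-line pointer.
    path = hf_hub_download(repo_id="org/model-repo", filename="tokenizer.json")
    print(path)  # local cache path of the resolved tokenizer file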
tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
+ {
+ "add_bos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151646": {
+ "content": "<|object_ref_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151647": {
+ "content": "<|object_ref_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151648": {
+ "content": "<|box_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151649": {
+ "content": "<|box_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151650": {
+ "content": "<|quad_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151651": {
+ "content": "<|quad_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151652": {
+ "content": "<|vision_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151653": {
+ "content": "<|vision_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151654": {
+ "content": "<|vision_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151655": {
+ "content": "<|image_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151656": {
+ "content": "<|video_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151657": {
+ "content": "<tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151658": {
+ "content": "</tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151659": {
+ "content": "<|fim_prefix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151660": {
+ "content": "<|fim_middle|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151661": {
+ "content": "<|fim_suffix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151662": {
+ "content": "<|fim_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151663": {
+ "content": "<|repo_name|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151664": {
+ "content": "<|file_sep|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151665": {
+ "content": "<tool_response>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151666": {
+ "content": "</tool_response>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151667": {
+ "content": "<think>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151668": {
+ "content": "</think>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "bos_token": "<|endoftext|>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|im_end|>",
+ "errors": "replace",
+ "extra_special_tokens": {},
+ "model_max_length": 1010000,
+ "pad_token": "<|endoftext|>",
+ "padding_side": "left",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+ }
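tokenizer_config.json registers added-token ids 151643 through 151668 for the Qwen2Tokenizer class: the chat and vision markers carry "special": true, so skip_special_tokens strips them on decode, while the tool-call, FIM, and <think>/</think> markers are deliberately non-special and survive decoding; left padding with <|endoftext|> suits batched generation. A small check of that decode behavior, assuming the standard transformers API (the local path is a placeholder):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./model-repo")  # placeholder path
    ids = tok.encode("<|im_start|>assistant<think>plan</think>")
    # <|im_start|> is special and gets stripped; <think>...</think> is not, so it stays.
    print(tok.decode(ids, skip_special_tokens=True))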
training-logs.csv ADDED
@@ -0,0 +1,168 @@
+ ,eval_loss,eval_binary,eval_rouge,eval_llm_as_a_judge,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch,step,loss,grad_norm,learning_rate,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
+ 0,1.0263168811798096,0.0,0.8630774330746501,0.0,49.0875,1.019,1.019,0.0,0,,,,,,,,
+ 1,,,,,,,,0.02477455158061639,250,0.5324,1.523032546043396,6.067360079247153e-06,,,,,
+ 2,,,,,,,,0.04954910316123278,500,0.1588,1.8619871139526367,1.225854383358098e-05,,,,,
+ 3,,,,,,,,0.07432365474184917,750,0.1432,0.046056393533945084,1.8449727587914812e-05,,,,,
+ 4,,,,,,,,0.09909820632246556,1000,0.1222,0.767810583114624,2.464091134224864e-05,,,,,
+ 5,,,,,,,,0.12387275790308196,1250,0.1289,0.08577042073011398,3.083209509658247e-05,,,,,
+ 6,,,,,,,,0.14864730948369834,1500,0.123,0.9776291847229004,3.70232788509163e-05,,,,,
+ 7,,,,,,,,0.17342186106431473,1750,0.1247,0.6648195385932922,4.321446260525013e-05,,,,,
+ 8,,,,,,,,0.19819641264493112,2000,0.1222,0.10744159668684006,4.940564635958396e-05,,,,,
+ 9,,,,,,,,0.22297096422554752,2250,0.1197,0.4264952838420868,4.970530708045378e-05,,,,,
+ 10,,,,,,,,0.2477455158061639,2500,0.1319,0.7932649850845337,4.937931933759291e-05,,,,,
+ 11,,,,,,,,0.2725200673867803,2750,0.124,0.016937250271439552,4.905333159473204e-05,,,,,
+ 12,,,,,,,,0.2972946189673967,3000,0.112,1.084240198135376,4.8727343851871173e-05,,,,,
+ 13,,,,,,,,0.3220691705480131,3250,0.1105,0.058936528861522675,4.84013561090103e-05,,,,,
+ 14,,,,,,,,0.34684372212862946,3500,0.117,2.2370946407318115,4.8075368366149436e-05,,,,,
+ 15,,,,,,,,0.3716182737092459,3750,0.1142,0.25352439284324646,4.774938062328856e-05,,,,,
+ 16,,,,,,,,0.39639282528986225,4000,0.115,0.746067225933075,4.74233928804277e-05,,,,,
+ 17,,,,,,,,0.42116737687047867,4250,0.1315,0.5937117338180542,4.709740513756683e-05,,,,,
+ 18,,,,,,,,0.44594192845109504,4500,0.1148,0.0067167095839977264,4.677141739470596e-05,,,,,
+ 19,,,,,,,,0.4707164800317114,4750,0.1094,0.6822088360786438,4.644542965184509e-05,,,,,
+ 20,,,,,,,,0.4954910316123278,5000,0.1123,0.5732296705245972,4.611944190898422e-05,,,,,
+ 21,,,,,,,,0.5202655831929442,5250,0.105,1.033423662185669,4.5793454166123356e-05,,,,,
+ 22,,,,,,,,0.5450401347735606,5500,0.1097,0.15889854729175568,4.546746642326249e-05,,,,,
+ 23,,,,,,,,0.569814686354177,5750,0.1125,1.1788893938064575,4.514147868040162e-05,,,,,
+ 24,,,,,,,,0.5945892379347933,6000,0.1078,0.23025964200496674,4.4815490937540746e-05,,,,,
+ 25,,,,,,,,0.6193637895154098,6250,0.1026,0.9611893892288208,4.448950319467988e-05,,,,,
+ 26,,,,,,,,0.6441383410960262,6500,0.0974,0.016058839857578278,4.4163515451819015e-05,,,,,
+ 27,,,,,,,,0.6689128926766426,6750,0.0965,0.3839409649372101,4.383752770895814e-05,,,,,
+ 28,,,,,,,,0.6936874442572589,7000,0.1098,0.3105497658252716,4.351153996609728e-05,,,,,
+ 29,,,,,,,,0.7184619958378753,7250,0.1116,0.04357475787401199,4.318555222323641e-05,,,,,
+ 30,,,,,,,,0.7432365474184918,7500,0.1061,0.26533493399620056,4.285956448037554e-05,,,,,
+ 31,,,,,,,,0.7680110989991081,7750,0.1009,0.647885262966156,4.253357673751467e-05,,,,,
+ 32,,,,,,,,0.7927856505797245,8000,0.0929,1.25872802734375,4.22075889946538e-05,,,,,
+ 33,,,,,,,,0.8175602021603409,8250,0.0992,0.7607027888298035,4.1881601251792936e-05,,,,,
+ 34,,,,,,,,0.8423347537409573,8500,0.097,0.023414086550474167,4.155561350893207e-05,,,,,
+ 35,,,,,,,,0.8671093053215737,8750,0.0974,1.617354154586792,4.12296257660712e-05,,,,,
+ 36,,,,,,,,0.8918838569021901,9000,0.0992,0.348496675491333,4.0903638023210325e-05,,,,,
+ 37,,,,,,,,0.9166584084828064,9250,0.1068,0.45875710248947144,4.057765028034946e-05,,,,,
+ 38,,,,,,,,0.9414329600634228,9500,0.0989,0.5602991580963135,4.0251662537488594e-05,,,,,
+ 39,,,,,,,,0.9662075116440393,9750,0.0957,0.3049614131450653,3.992567479462773e-05,,,,,
+ 40,,,,,,,,0.9909820632246557,10000,0.1069,0.5346589684486389,3.9599687051766856e-05,,,,,
+ 41,0.06276851147413254,0.5,0.9718670465458659,0.5,39.1908,1.276,1.276,1.0,10091,,,,,,,,
+ 42,,,,,,,,1.0157566148052721,10250,0.093,1.198898434638977,3.9273699308905984e-05,,,,,
+ 43,,,,,,,,1.0405311663858885,10500,0.0856,0.5537581443786621,3.894771156604512e-05,,,,,
+ 44,,,,,,,,1.0653057179665049,10750,0.0932,0.7537667155265808,3.862172382318425e-05,,,,,
+ 45,,,,,,,,1.0900802695471212,11000,0.0842,0.5561412572860718,3.829573608032338e-05,,,,,
+ 46,,,,,,,,1.1148548211277376,11250,0.0867,0.6610764861106873,3.797105228843396e-05,,,,,
+ 47,,,,,,,,1.139629372708354,11500,0.08,2.3849334716796875,3.7645064545573086e-05,,,,,
+ 48,,,,,,,,1.1644039242889703,11750,0.0848,0.4288468062877655,3.731907680271222e-05,,,,,
+ 49,,,,,,,,1.1891784758695867,12000,0.0825,1.3528180122375488,3.699439301082279e-05,,,,,
+ 50,,,,,,,,1.213953027450203,12250,0.0806,1.907565951347351,3.6668405267961925e-05,,,,,
+ 51,,,,,,,,1.2387275790308196,12500,0.0832,0.8397055268287659,3.634241752510106e-05,,,,,
+ 52,,,,,,,,1.263502130611436,12750,0.0877,0.14878003299236298,3.601642978224019e-05,,,,,
+ 53,,,,,,,,1.2882766821920524,13000,0.0866,0.009639033116400242,3.569044203937932e-05,,,,,
+ 54,,,,,,,,1.3130512337726687,13250,0.0791,0.03321487829089165,3.536445429651845e-05,,,,,
+ 55,,,,,,,,1.3378257853532851,13500,0.0809,0.006037577521055937,3.5038466553657584e-05,,,,,
+ 56,,,,,,,,1.3626003369339015,13750,0.0854,0.018767058849334717,3.471247881079672e-05,,,,,
+ 57,,,,,,,,1.3873748885145178,14000,0.0922,0.07840000838041306,3.438649106793585e-05,,,,,
+ 58,,,,,,,,1.4121494400951342,14250,0.0837,0.0023212512023746967,3.406050332507498e-05,,,,,
+ 59,,,,,,,,1.4369239916757506,14500,0.0848,0.005713989492505789,3.373451558221411e-05,,,,,
+ 60,,,,,,,,1.4616985432563672,14750,0.0858,0.5069279670715332,3.340852783935324e-05,,,,,
+ 61,,,,,,,,1.4864730948369835,15000,0.079,0.7604624032974243,3.308254009649238e-05,,,,,
+ 62,,,,,,,,1.5112476464176,15250,0.0929,0.8304641246795654,3.2756552353631505e-05,,,,,
+ 63,,,,,,,,1.5360221979982163,15500,0.0883,0.7251547574996948,3.243186856174208e-05,,,,,
+ 64,,,,,,,,1.5607967495788326,15750,0.0778,1.3709897994995117,3.210588081888121e-05,,,,,
+ 65,,,,,,,,1.585571301159449,16000,0.0802,2.1567022800445557,3.1779893076020344e-05,,,,,
+ 66,,,,,,,,1.6103458527400654,16250,0.0831,0.09836085140705109,3.145390533315947e-05,,,,,
+ 67,,,,,,,,1.635120404320682,16500,0.0795,0.4006144106388092,3.1127917590298606e-05,,,,,
+ 68,,,,,,,,1.659894955901298,16750,0.0856,0.2960919439792633,3.080192984743774e-05,,,,,
+ 69,,,,,,,,1.6846695074819147,17000,0.0889,0.02061443217098713,3.0475942104576872e-05,,,,,
+ 70,,,,,,,,1.7094440590625308,17250,0.0754,0.7631092071533203,3.0149954361716e-05,,,,,
+ 71,,,,,,,,1.7342186106431474,17500,0.0775,0.6703758239746094,2.982396661885513e-05,,,,,
+ 72,,,,,,,,1.7589931622237638,17750,0.0817,0.756374716758728,2.9497978875994265e-05,,,,,
+ 73,,,,,,,,1.7837677138043802,18000,0.0876,0.5492820739746094,2.9171991133133396e-05,,,,,
+ 74,,,,,,,,1.8085422653849965,18250,0.0882,0.3442634642124176,2.8846003390272524e-05,,,,,
+ 75,,,,,,,,1.8333168169656129,18500,0.0861,3.4848599433898926,2.8520015647411658e-05,,,,,
+ 76,,,,,,,,1.8580913685462295,18750,0.0921,0.46571600437164307,2.819402790455079e-05,,,,,
+ 77,,,,,,,,1.8828659201268456,19000,0.0792,0.221365824341774,2.7868040161689924e-05,,,,,
+ 78,,,,,,,,1.9076404717074622,19250,0.0849,0.819985032081604,2.754205241882905e-05,,,,,
+ 79,,,,,,,,1.9324150232880783,19500,0.0879,0.06596075743436813,2.7216064675968182e-05,,,,,
+ 80,,,,,,,,1.957189574868695,19750,0.0855,0.5881988406181335,2.689138088407876e-05,,,,,
+ 81,,,,,,,,1.9819641264493113,20000,0.0782,0.5516498684883118,2.656539314121789e-05,,,,,
+ 82,0.053498007357120514,0.54,0.9765167610159038,0.54,39.3735,1.27,1.27,2.0,20182,,,,,,,,
+ 83,,,,,,,,2.0067386780299277,20250,0.0736,0.7942800521850586,2.6239405398357025e-05,,,,,
+ 84,,,,,,,,2.0315132296105443,20500,0.0626,0.0008877600193955004,2.5913417655496153e-05,,,,,
+ 85,,,,,,,,2.0562877811911604,20750,0.0667,1.7306857109069824,2.5587429912635284e-05,,,,,
+ 86,,,,,,,,2.081062332771777,21000,0.0611,0.08003373444080353,2.526144216977442e-05,,,,,
+ 87,,,,,,,,2.105836884352393,21250,0.0651,0.6958248019218445,2.493545442691355e-05,,,,,
+ 88,,,,,,,,2.1306114359330097,21500,0.0698,0.6856549978256226,2.460946668405268e-05,,,,,
+ 89,,,,,,,,2.155385987513626,21750,0.0619,0.28693079948425293,2.4283478941191812e-05,,,,,
+ 90,,,,,,,,2.1801605390942425,22000,0.0698,0.21666885912418365,2.3957491198330946e-05,,,,,
+ 91,,,,,,,,2.2049350906748586,22250,0.0668,0.8755005598068237,2.3631503455470074e-05,,,,,
+ 92,,,,,,,,2.229709642255475,22500,0.0713,0.23723356425762177,2.3305515712609208e-05,,,,,
+ 93,,,,,,,,2.2544841938360918,22750,0.0593,0.7852942943572998,2.297952796974834e-05,,,,,
+ 94,,,,,,,,2.279258745416708,23000,0.067,1.1780858039855957,2.265354022688747e-05,,,,,
+ 95,,,,,,,,2.3040332969973245,23250,0.0691,0.7586830854415894,2.23275524840266e-05,,,,,
+ 96,,,,,,,,2.3288078485779407,23500,0.0673,0.272504061460495,2.2001564741165732e-05,,,,,
+ 97,,,,,,,,2.3535824001585572,23750,0.0702,0.830717146396637,2.1676880949276307e-05,,,,,
+ 98,,,,,,,,2.3783569517391734,24000,0.0693,1.0476861000061035,2.135089320641544e-05,,,,,
+ 99,,,,,,,,2.40313150331979,24250,0.0601,0.2545061409473419,2.102490546355457e-05,,,,,
+ 100,,,,,,,,2.427906054900406,24500,0.0645,0.5762472152709961,2.0698917720693703e-05,,,,,
+ 101,,,,,,,,2.4526806064810227,24750,0.0754,0.5169177055358887,2.0372929977832834e-05,,,,,
+ 102,,,,,,,,2.4774551580616393,25000,0.0648,0.3621724545955658,2.0046942234971965e-05,,,,,
+ 103,,,,,,,,2.5022297096422554,25250,0.071,0.9184062480926514,1.97209544921111e-05,,,,,
+ 104,,,,,,,,2.527004261222872,25500,0.0733,0.05600641295313835,1.939496674925023e-05,,,,,
+ 105,,,,,,,,2.551778812803488,25750,0.0667,0.017861563712358475,1.9070282957360805e-05,,,,,
+ 106,,,,,,,,2.5765533643841048,26000,0.0595,1.1122207641601562,1.8744295214499936e-05,,,,,
+ 107,,,,,,,,2.601327915964721,26250,0.0657,0.022980080917477608,1.841961142261051e-05,,,,,
+ 108,,,,,,,,2.6261024675453375,26500,0.0718,0.4926201105117798,1.809362367974964e-05,,,,,
+ 109,,,,,,,,2.6508770191259536,26750,0.0633,0.6802681088447571,1.7767635936888775e-05,,,,,
+ 110,,,,,,,,2.6756515707065702,27000,0.063,0.33439555764198303,1.7441648194027903e-05,,,,,
+ 111,,,,,,,,2.700426122287187,27250,0.067,0.07250916957855225,1.7115660451167038e-05,,,,,
+ 112,,,,,,,,2.725200673867803,27500,0.0678,0.013326438143849373,1.678967270830617e-05,,,,,
+ 113,,,,,,,,2.7499752254484195,27750,0.0651,0.48589280247688293,1.64636849654453e-05,,,,,
+ 114,,,,,,,,2.7747497770290357,28000,0.0582,0.0067026917822659016,1.613769722258443e-05,,,,,
+ 115,,,,,,,,2.7995243286096523,28250,0.0682,0.0031660799868404865,1.5811709479723562e-05,,,,,
+ 116,,,,,,,,2.8242988801902684,28500,0.0648,1.3722319602966309,1.5485721736862693e-05,,,,,
+ 117,,,,,,,,2.849073431770885,28750,0.0687,0.9632564187049866,1.5159733994001826e-05,,,,,
+ 118,,,,,,,,2.873847983351501,29000,0.0567,0.3386249244213104,1.483374625114096e-05,,,,,
+ 119,,,,,,,,2.8986225349321177,29250,0.0679,0.33812034130096436,1.450775850828009e-05,,,,,
+ 120,,,,,,,,2.9233970865127343,29500,0.0609,0.356045663356781,1.4181770765419222e-05,,,,,
+ 121,,,,,,,,2.9481716380933505,29750,0.0675,0.031641993671655655,1.3855783022558353e-05,,,,,
+ 122,,,,,,,,2.972946189673967,30000,0.0637,0.0011654272675514221,1.3529795279697486e-05,,,,,
+ 123,,,,,,,,2.997720741254583,30250,0.0642,0.00022352008090820163,1.3203807536836615e-05,,,,,
+ 124,0.05629619210958481,0.54,0.9778917608241781,0.54,39.6277,1.262,1.262,3.0,30273,,,,,,,,
+ 125,,,,,,,,3.0224952928352,30500,0.0566,0.7295501828193665,1.2879123744947191e-05,,,,,
+ 126,,,,,,,,3.047269844415816,30750,0.055,2.9202334880828857,1.2553136002086322e-05,,,,,
+ 127,,,,,,,,3.0720443959964325,31000,0.0614,0.2907790243625641,1.2227148259225455e-05,,,,,
+ 128,,,,,,,,3.0968189475770487,31250,0.0532,0.0033611482940614223,1.1901160516364586e-05,,,,,
+ 129,,,,,,,,3.1215934991576653,31500,0.0478,0.5726149678230286,1.1575172773503717e-05,,,,,
+ 130,,,,,,,,3.146368050738282,31750,0.0544,0.8363867402076721,1.124918503064285e-05,,,,,
+ 131,,,,,,,,3.171142602318898,32000,0.0594,0.7790529727935791,1.092319728778198e-05,,,,,
+ 132,,,,,,,,3.1959171538995146,32250,0.0544,0.010002855211496353,1.0597209544921112e-05,,,,,
+ 133,,,,,,,,3.2206917054801307,32500,0.0538,0.02614864706993103,1.0271221802060243e-05,,,,,
+ 134,,,,,,,,3.2454662570607473,32750,0.0492,1.2334165573120117,9.945234059199376e-06,,,,,
+ 135,,,,,,,,3.2702408086413635,33000,0.0557,0.4991805851459503,9.619246316338507e-06,,,,,
+ 136,,,,,,,,3.29501536022198,33250,0.0522,0.7711235284805298,9.29456252444908e-06,,,,,
+ 137,,,,,,,,3.319789911802596,33500,0.047,0.45638665556907654,8.968574781588213e-06,,,,,
+ 138,,,,,,,,3.344564463383213,33750,0.0542,0.9393787384033203,8.642587038727344e-06,,,,,
+ 139,,,,,,,,3.3693390149638294,34000,0.0506,0.20903021097183228,8.316599295866476e-06,,,,,
+ 140,,,,,,,,3.3941135665444455,34250,0.0508,0.5082834959030151,7.990611553005607e-06,,,,,
+ 141,,,,,,,,3.418888118125062,34500,0.0474,0.20875811576843262,7.66462381014474e-06,,,,,
+ 142,,,,,,,,3.4436626697056782,34750,0.0497,0.28049978613853455,7.33863606728387e-06,,,,,
+ 143,,,,,,,,3.468437221286295,35000,0.054,0.8045966625213623,7.0126483244230014e-06,,,,,
+ 144,,,,,,,,3.493211772866911,35250,0.0463,1.8549096584320068,6.686660581562133e-06,,,,,
+ 145,,,,,,,,3.5179863244475276,35500,0.0493,0.01312224566936493,6.363280740644152e-06,,,,,
+ 146,,,,,,,,3.5427608760281437,35750,0.049,1.024755835533142,6.037292997783284e-06,,,,,
+ 147,,,,,,,,3.5675354276087603,36000,0.0495,1.1740553379058838,5.711305254922415e-06,,,,,
+ 148,,,,,,,,3.592309979189377,36250,0.0537,0.3666393458843231,5.385317512061547e-06,,,,,
+ 149,,,,,,,,3.617084530769993,36500,0.0483,1.2278428077697754,5.059329769200678e-06,,,,,
+ 150,,,,,,,,3.641859082350609,36750,0.0507,1.8344718217849731,4.73334202633981e-06,,,,,
+ 151,,,,,,,,3.6666336339312258,37000,0.0489,1.3024396896362305,4.407354283478941e-06,,,,,
+ 152,,,,,,,,3.6914081855118424,37250,0.0529,0.5916293859481812,4.081366540618073e-06,,,,,
+ 153,,,,,,,,3.7161827370924585,37500,0.0522,0.7963153719902039,3.7553787977572044e-06,,,,,
+ 154,,,,,,,,3.740957288673075,37750,0.0495,1.1936062574386597,3.429391054896336e-06,,,,,
+ 155,,,,,,,,3.7657318402536912,38000,0.05,0.0045746322721242905,3.1034033120354673e-06,,,,,
+ 156,,,,,,,,3.790506391834308,38250,0.0525,0.3080344498157501,2.777415569174599e-06,,,,,
+ 157,,,,,,,,3.8152809434149244,38500,0.0497,0.7108872532844543,2.4514278263137307e-06,,,,,
+ 158,,,,,,,,3.8400554949955406,38750,0.0529,0.5039321184158325,2.1254400834528626e-06,,,,,
+ 159,,,,,,,,3.8648300465761567,39000,0.0515,0.45206305384635925,1.7994523405919939e-06,,,,,
+ 160,,,,,,,,3.8896045981567733,39250,0.0451,0.5372409820556641,1.4734645977311256e-06,,,,,
+ 161,,,,,,,,3.91437914973739,39500,0.0579,0.23797687888145447,1.147476854870257e-06,,,,,
+ 162,,,,,,,,3.939153701318006,39750,0.0487,0.8789421319961548,8.22793062980832e-07,,,,,
+ 163,,,,,,,,3.9639282528986226,40000,0.0492,0.1474274843931198,4.968053201199636e-07,,,,,
+ 164,,,,,,,,3.9887028044792388,40250,0.0538,0.007477769162505865,1.7081757725909506e-07,,,,,
+ 165,0.05352379381656647,0.54,0.9770745948806367,0.54,39.8893,1.253,1.253,4.0,40364,,,,,,,,
+ 166,,,,,,,,4.0,40364,,,,6734.6117,5.994,5.994,3.153073337044746e+17,0.08114129684687535
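The CSV above logs four epochs (40,364 optimizer steps) with an evaluation at each epoch boundary: eval_loss falls from 1.026 at step 0 to 0.063 after the first epoch and hovers near 0.054 thereafter, while eval_binary and eval_llm_as_a_judge climb from 0.0 to 0.54 by epoch 2 and then plateau; the final row records an aggregate train_loss of about 0.081. A minimal sketch for isolating those eval rows, assuming the file is read with pandas:

    import pandas as pd

    df = pd.read_csv("training-logs.csv", index_col=0)
    # Only rows where an evaluation actually ran have eval_loss populated.
    evals = df.dropna(subset=["eval_loss"])
    print(evals[["epoch", "step", "eval_loss", "eval_binary", "eval_llm_as_a_judge"]])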
training-logs.json ADDED
@@ -0,0 +1,167 @@
+ {"eval_loss":1.0263168812,"eval_binary":0.0,"eval_rouge":0.8630774331,"eval_llm_as_a_judge":0.0,"eval_runtime":49.0875,"eval_samples_per_second":1.019,"eval_steps_per_second":1.019,"epoch":0.0,"step":0,"loss":null,"grad_norm":null,"learning_rate":null,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.0247745516,"step":250,"loss":0.5324,"grad_norm":1.523032546,"learning_rate":0.0000060674,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.0495491032,"step":500,"loss":0.1588,"grad_norm":1.861987114,"learning_rate":0.0000122585,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.0743236547,"step":750,"loss":0.1432,"grad_norm":0.0460563935,"learning_rate":0.0000184497,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.0990982063,"step":1000,"loss":0.1222,"grad_norm":0.7678105831,"learning_rate":0.0000246409,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.1238727579,"step":1250,"loss":0.1289,"grad_norm":0.0857704207,"learning_rate":0.0000308321,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.1486473095,"step":1500,"loss":0.123,"grad_norm":0.9776291847,"learning_rate":0.0000370233,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.1734218611,"step":1750,"loss":0.1247,"grad_norm":0.6648195386,"learning_rate":0.0000432145,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.1981964126,"step":2000,"loss":0.1222,"grad_norm":0.1074415967,"learning_rate":0.0000494056,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.2229709642,"step":2250,"loss":0.1197,"grad_norm":0.4264952838,"learning_rate":0.0000497053,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.2477455158,"step":2500,"loss":0.1319,"grad_norm":0.7932649851,"learning_rate":0.0000493793,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.2725200674,"step":2750,"loss":0.124,"grad_norm":0.0169372503,"learning_rate":0.0000490533,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.297294619,"step":3000,"loss":0.112,"grad_norm":1.0842401981,"learning_rate":0.0000487273,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.3220691705,"step":3250,"loss":0.1105,"grad_norm":0.0589365289,"learning_rate":0.0000484014,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.3468437221,"step":3500,"loss":0.117,"grad_norm":2.2370946407,"learning_rate":0.0000480754,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.3716182737,"step":3750,"loss":0.1142,"grad_norm":0.2535243928,"learning_rate":0.0000477494,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.3963928253,"step":4000,"loss":0.115,"grad_norm":0.7460672259,"learning_rate":0.0000474234,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.4211673769,"step":4250,"loss":0.1315,"grad_norm":0.5937117338,"learning_rate":0.0000470974,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.4459419285,"step":4500,"loss":0.1148,"grad_norm":0.0067167096,"learning_rate":0.0000467714,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.47071648,"step":4750,"loss":0.1094,"grad_norm":0.6822088361,"learning_rate":0.0000464454,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.4954910316,"step":5000,"loss":0.1123,"grad_norm":0.5732296705,"learning_rate":0.0000461194,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.5202655832,"step":5250,"loss":0.105,"grad_norm":1.0334236622,"learning_rate":0.0000457935,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.5450401348,"step":5500,"loss":0.1097,"grad_norm":0.1588985473,"learning_rate":0.0000454675,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.5698146864,"step":5750,"loss":0.1125,"grad_norm":1.1788893938,"learning_rate":0.0000451415,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.5945892379,"step":6000,"loss":0.1078,"grad_norm":0.230259642,"learning_rate":0.0000448155,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.6193637895,"step":6250,"loss":0.1026,"grad_norm":0.9611893892,"learning_rate":0.0000444895,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.6441383411,"step":6500,"loss":0.0974,"grad_norm":0.0160588399,"learning_rate":0.0000441635,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.6689128927,"step":6750,"loss":0.0965,"grad_norm":0.3839409649,"learning_rate":0.0000438375,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.6936874443,"step":7000,"loss":0.1098,"grad_norm":0.3105497658,"learning_rate":0.0000435115,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.7184619958,"step":7250,"loss":0.1116,"grad_norm":0.0435747579,"learning_rate":0.0000431856,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.7432365474,"step":7500,"loss":0.1061,"grad_norm":0.265334934,"learning_rate":0.0000428596,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.768011099,"step":7750,"loss":0.1009,"grad_norm":0.647885263,"learning_rate":0.0000425336,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.7927856506,"step":8000,"loss":0.0929,"grad_norm":1.2587280273,"learning_rate":0.0000422076,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.8175602022,"step":8250,"loss":0.0992,"grad_norm":0.7607027888,"learning_rate":0.0000418816,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.8423347537,"step":8500,"loss":0.097,"grad_norm":0.0234140866,"learning_rate":0.0000415556,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.8671093053,"step":8750,"loss":0.0974,"grad_norm":1.6173541546,"learning_rate":0.0000412296,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.8918838569,"step":9000,"loss":0.0992,"grad_norm":0.3484966755,"learning_rate":0.0000409036,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.9166584085,"step":9250,"loss":0.1068,"grad_norm":0.4587571025,"learning_rate":0.0000405777,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.9414329601,"step":9500,"loss":0.0989,"grad_norm":0.5602991581,"learning_rate":0.0000402517,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.9662075116,"step":9750,"loss":0.0957,"grad_norm":0.3049614131,"learning_rate":0.0000399257,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":0.9909820632,"step":10000,"loss":0.1069,"grad_norm":0.5346589684,"learning_rate":0.0000395997,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":0.0627685115,"eval_binary":0.5,"eval_rouge":0.9718670465,"eval_llm_as_a_judge":0.5,"eval_runtime":39.1908,"eval_samples_per_second":1.276,"eval_steps_per_second":1.276,"epoch":1.0,"step":10091,"loss":null,"grad_norm":null,"learning_rate":null,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.0157566148,"step":10250,"loss":0.093,"grad_norm":1.1988984346,"learning_rate":0.0000392737,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.0405311664,"step":10500,"loss":0.0856,"grad_norm":0.5537581444,"learning_rate":0.0000389477,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.065305718,"step":10750,"loss":0.0932,"grad_norm":0.7537667155,"learning_rate":0.0000386217,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.0900802695,"step":11000,"loss":0.0842,"grad_norm":0.5561412573,"learning_rate":0.0000382957,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.1148548211,"step":11250,"loss":0.0867,"grad_norm":0.6610764861,"learning_rate":0.0000379711,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.1396293727,"step":11500,"loss":0.08,"grad_norm":2.3849334717,"learning_rate":0.0000376451,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.1644039243,"step":11750,"loss":0.0848,"grad_norm":0.4288468063,"learning_rate":0.0000373191,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.1891784759,"step":12000,"loss":0.0825,"grad_norm":1.3528180122,"learning_rate":0.0000369944,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.2139530275,"step":12250,"loss":0.0806,"grad_norm":1.9075659513,"learning_rate":0.0000366684,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.238727579,"step":12500,"loss":0.0832,"grad_norm":0.8397055268,"learning_rate":0.0000363424,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.2635021306,"step":12750,"loss":0.0877,"grad_norm":0.148780033,"learning_rate":0.0000360164,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.2882766822,"step":13000,"loss":0.0866,"grad_norm":0.0096390331,"learning_rate":0.0000356904,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.3130512338,"step":13250,"loss":0.0791,"grad_norm":0.0332148783,"learning_rate":0.0000353645,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.3378257854,"step":13500,"loss":0.0809,"grad_norm":0.0060375775,"learning_rate":0.0000350385,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.3626003369,"step":13750,"loss":0.0854,"grad_norm":0.0187670588,"learning_rate":0.0000347125,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.3873748885,"step":14000,"loss":0.0922,"grad_norm":0.0784000084,"learning_rate":0.0000343865,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.4121494401,"step":14250,"loss":0.0837,"grad_norm":0.0023212512,"learning_rate":0.0000340605,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.4369239917,"step":14500,"loss":0.0848,"grad_norm":0.0057139895,"learning_rate":0.0000337345,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.4616985433,"step":14750,"loss":0.0858,"grad_norm":0.5069279671,"learning_rate":0.0000334085,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.4864730948,"step":15000,"loss":0.079,"grad_norm":0.7604624033,"learning_rate":0.0000330825,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.5112476464,"step":15250,"loss":0.0929,"grad_norm":0.8304641247,"learning_rate":0.0000327566,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.536022198,"step":15500,"loss":0.0883,"grad_norm":0.7251547575,"learning_rate":0.0000324319,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.5607967496,"step":15750,"loss":0.0778,"grad_norm":1.3709897995,"learning_rate":0.0000321059,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.5855713012,"step":16000,"loss":0.0802,"grad_norm":2.15670228,"learning_rate":0.0000317799,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.6103458527,"step":16250,"loss":0.0831,"grad_norm":0.0983608514,"learning_rate":0.0000314539,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.6351204043,"step":16500,"loss":0.0795,"grad_norm":0.4006144106,"learning_rate":0.0000311279,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.6598949559,"step":16750,"loss":0.0856,"grad_norm":0.296091944,"learning_rate":0.0000308019,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.6846695075,"step":17000,"loss":0.0889,"grad_norm":0.0206144322,"learning_rate":0.0000304759,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.7094440591,"step":17250,"loss":0.0754,"grad_norm":0.7631092072,"learning_rate":0.00003015,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.7342186106,"step":17500,"loss":0.0775,"grad_norm":0.670375824,"learning_rate":0.000029824,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.7589931622,"step":17750,"loss":0.0817,"grad_norm":0.7563747168,"learning_rate":0.000029498,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.7837677138,"step":18000,"loss":0.0876,"grad_norm":0.549282074,"learning_rate":0.000029172,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.8085422654,"step":18250,"loss":0.0882,"grad_norm":0.3442634642,"learning_rate":0.000028846,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.833316817,"step":18500,"loss":0.0861,"grad_norm":3.4848599434,"learning_rate":0.00002852,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.8580913685,"step":18750,"loss":0.0921,"grad_norm":0.4657160044,"learning_rate":0.000028194,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.8828659201,"step":19000,"loss":0.0792,"grad_norm":0.2213658243,"learning_rate":0.000027868,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.9076404717,"step":19250,"loss":0.0849,"grad_norm":0.8199850321,"learning_rate":0.0000275421,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.9324150233,"step":19500,"loss":0.0879,"grad_norm":0.0659607574,"learning_rate":0.0000272161,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.9571895749,"step":19750,"loss":0.0855,"grad_norm":0.5881988406,"learning_rate":0.0000268914,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":1.9819641264,"step":20000,"loss":0.0782,"grad_norm":0.5516498685,"learning_rate":0.0000265654,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":0.0534980074,"eval_binary":0.54,"eval_rouge":0.976516761,"eval_llm_as_a_judge":0.54,"eval_runtime":39.3735,"eval_samples_per_second":1.27,"eval_steps_per_second":1.27,"epoch":2.0,"step":20182,"loss":null,"grad_norm":null,"learning_rate":null,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.006738678,"step":20250,"loss":0.0736,"grad_norm":0.7942800522,"learning_rate":0.0000262394,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.0315132296,"step":20500,"loss":0.0626,"grad_norm":0.00088776,"learning_rate":0.0000259134,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.0562877812,"step":20750,"loss":0.0667,"grad_norm":1.7306857109,"learning_rate":0.0000255874,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.0810623328,"step":21000,"loss":0.0611,"grad_norm":0.0800337344,"learning_rate":0.0000252614,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
88
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.1058368844,"step":21250,"loss":0.0651,"grad_norm":0.6958248019,"learning_rate":0.0000249355,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
89
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.1306114359,"step":21500,"loss":0.0698,"grad_norm":0.6856549978,"learning_rate":0.0000246095,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
90
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.1553859875,"step":21750,"loss":0.0619,"grad_norm":0.2869307995,"learning_rate":0.0000242835,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
91
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.1801605391,"step":22000,"loss":0.0698,"grad_norm":0.2166688591,"learning_rate":0.0000239575,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
92
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.2049350907,"step":22250,"loss":0.0668,"grad_norm":0.8755005598,"learning_rate":0.0000236315,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
93
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.2297096423,"step":22500,"loss":0.0713,"grad_norm":0.2372335643,"learning_rate":0.0000233055,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
94
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.2544841938,"step":22750,"loss":0.0593,"grad_norm":0.7852942944,"learning_rate":0.0000229795,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
95
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.2792587454,"step":23000,"loss":0.067,"grad_norm":1.178085804,"learning_rate":0.0000226535,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
96
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.304033297,"step":23250,"loss":0.0691,"grad_norm":0.7586830854,"learning_rate":0.0000223276,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
97
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.3288078486,"step":23500,"loss":0.0673,"grad_norm":0.2725040615,"learning_rate":0.0000220016,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
98
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.3535824002,"step":23750,"loss":0.0702,"grad_norm":0.8307171464,"learning_rate":0.0000216769,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
99
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.3783569517,"step":24000,"loss":0.0693,"grad_norm":1.0476861,"learning_rate":0.0000213509,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
100
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.4031315033,"step":24250,"loss":0.0601,"grad_norm":0.2545061409,"learning_rate":0.0000210249,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
101
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.4279060549,"step":24500,"loss":0.0645,"grad_norm":0.5762472153,"learning_rate":0.0000206989,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
102
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.4526806065,"step":24750,"loss":0.0754,"grad_norm":0.5169177055,"learning_rate":0.0000203729,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
103
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.4774551581,"step":25000,"loss":0.0648,"grad_norm":0.3621724546,"learning_rate":0.0000200469,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
104
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.5022297096,"step":25250,"loss":0.071,"grad_norm":0.9184062481,"learning_rate":0.000019721,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
105
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.5270042612,"step":25500,"loss":0.0733,"grad_norm":0.056006413,"learning_rate":0.000019395,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
106
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.5517788128,"step":25750,"loss":0.0667,"grad_norm":0.0178615637,"learning_rate":0.0000190703,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
107
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.5765533644,"step":26000,"loss":0.0595,"grad_norm":1.1122207642,"learning_rate":0.0000187443,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
108
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.601327916,"step":26250,"loss":0.0657,"grad_norm":0.0229800809,"learning_rate":0.0000184196,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
109
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.6261024675,"step":26500,"loss":0.0718,"grad_norm":0.4926201105,"learning_rate":0.0000180936,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
110
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.6508770191,"step":26750,"loss":0.0633,"grad_norm":0.6802681088,"learning_rate":0.0000177676,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
111
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.6756515707,"step":27000,"loss":0.063,"grad_norm":0.3343955576,"learning_rate":0.0000174416,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
112
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.7004261223,"step":27250,"loss":0.067,"grad_norm":0.0725091696,"learning_rate":0.0000171157,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
113
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.7252006739,"step":27500,"loss":0.0678,"grad_norm":0.0133264381,"learning_rate":0.0000167897,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
114
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.7499752254,"step":27750,"loss":0.0651,"grad_norm":0.4858928025,"learning_rate":0.0000164637,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
115
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.774749777,"step":28000,"loss":0.0582,"grad_norm":0.0067026918,"learning_rate":0.0000161377,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
116
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.7995243286,"step":28250,"loss":0.0682,"grad_norm":0.00316608,"learning_rate":0.0000158117,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
117
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.8242988802,"step":28500,"loss":0.0648,"grad_norm":1.3722319603,"learning_rate":0.0000154857,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
118
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.8490734318,"step":28750,"loss":0.0687,"grad_norm":0.9632564187,"learning_rate":0.0000151597,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
119
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.8738479834,"step":29000,"loss":0.0567,"grad_norm":0.3386249244,"learning_rate":0.0000148337,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
120
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.8986225349,"step":29250,"loss":0.0679,"grad_norm":0.3381203413,"learning_rate":0.0000145078,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
121
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.9233970865,"step":29500,"loss":0.0609,"grad_norm":0.3560456634,"learning_rate":0.0000141818,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
122
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.9481716381,"step":29750,"loss":0.0675,"grad_norm":0.0316419937,"learning_rate":0.0000138558,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
123
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.9729461897,"step":30000,"loss":0.0637,"grad_norm":0.0011654273,"learning_rate":0.0000135298,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
124
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":2.9977207413,"step":30250,"loss":0.0642,"grad_norm":0.0002235201,"learning_rate":0.0000132038,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
125
+ {"eval_loss":0.0562961921,"eval_binary":0.54,"eval_rouge":0.9778917608,"eval_llm_as_a_judge":0.54,"eval_runtime":39.6277,"eval_samples_per_second":1.262,"eval_steps_per_second":1.262,"epoch":3.0,"step":30273,"loss":null,"grad_norm":null,"learning_rate":null,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
126
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.0224952928,"step":30500,"loss":0.0566,"grad_norm":0.7295501828,"learning_rate":0.0000128791,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
127
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.0472698444,"step":30750,"loss":0.055,"grad_norm":2.9202334881,"learning_rate":0.0000125531,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
128
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.072044396,"step":31000,"loss":0.0614,"grad_norm":0.2907790244,"learning_rate":0.0000122271,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
129
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.0968189476,"step":31250,"loss":0.0532,"grad_norm":0.0033611483,"learning_rate":0.0000119012,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
130
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.1215934992,"step":31500,"loss":0.0478,"grad_norm":0.5726149678,"learning_rate":0.0000115752,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
131
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.1463680507,"step":31750,"loss":0.0544,"grad_norm":0.8363867402,"learning_rate":0.0000112492,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
132
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.1711426023,"step":32000,"loss":0.0594,"grad_norm":0.7790529728,"learning_rate":0.0000109232,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
133
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.1959171539,"step":32250,"loss":0.0544,"grad_norm":0.0100028552,"learning_rate":0.0000105972,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
134
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.2206917055,"step":32500,"loss":0.0538,"grad_norm":0.0261486471,"learning_rate":0.0000102712,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
135
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.2454662571,"step":32750,"loss":0.0492,"grad_norm":1.2334165573,"learning_rate":0.0000099452,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
136
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.2702408086,"step":33000,"loss":0.0557,"grad_norm":0.4991805851,"learning_rate":0.0000096192,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
137
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.2950153602,"step":33250,"loss":0.0522,"grad_norm":0.7711235285,"learning_rate":0.0000092946,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
138
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.3197899118,"step":33500,"loss":0.047,"grad_norm":0.4563866556,"learning_rate":0.0000089686,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
139
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.3445644634,"step":33750,"loss":0.0542,"grad_norm":0.9393787384,"learning_rate":0.0000086426,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
140
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.369339015,"step":34000,"loss":0.0506,"grad_norm":0.209030211,"learning_rate":0.0000083166,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
141
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.3941135665,"step":34250,"loss":0.0508,"grad_norm":0.5082834959,"learning_rate":0.0000079906,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
142
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.4188881181,"step":34500,"loss":0.0474,"grad_norm":0.2087581158,"learning_rate":0.0000076646,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
143
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.4436626697,"step":34750,"loss":0.0497,"grad_norm":0.2804997861,"learning_rate":0.0000073386,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
144
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.4684372213,"step":35000,"loss":0.054,"grad_norm":0.8045966625,"learning_rate":0.0000070126,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
145
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.4932117729,"step":35250,"loss":0.0463,"grad_norm":1.8549096584,"learning_rate":0.0000066867,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
146
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.5179863244,"step":35500,"loss":0.0493,"grad_norm":0.0131222457,"learning_rate":0.0000063633,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
147
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.542760876,"step":35750,"loss":0.049,"grad_norm":1.0247558355,"learning_rate":0.0000060373,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
148
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.5675354276,"step":36000,"loss":0.0495,"grad_norm":1.1740553379,"learning_rate":0.0000057113,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
149
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.5923099792,"step":36250,"loss":0.0537,"grad_norm":0.3666393459,"learning_rate":0.0000053853,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
150
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.6170845308,"step":36500,"loss":0.0483,"grad_norm":1.2278428078,"learning_rate":0.0000050593,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
151
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.6418590824,"step":36750,"loss":0.0507,"grad_norm":1.8344718218,"learning_rate":0.0000047333,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
152
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.6666336339,"step":37000,"loss":0.0489,"grad_norm":1.3024396896,"learning_rate":0.0000044074,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
153
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.6914081855,"step":37250,"loss":0.0529,"grad_norm":0.5916293859,"learning_rate":0.0000040814,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
154
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.7161827371,"step":37500,"loss":0.0522,"grad_norm":0.796315372,"learning_rate":0.0000037554,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
155
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.7409572887,"step":37750,"loss":0.0495,"grad_norm":1.1936062574,"learning_rate":0.0000034294,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
156
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.7657318403,"step":38000,"loss":0.05,"grad_norm":0.0045746323,"learning_rate":0.0000031034,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
157
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.7905063918,"step":38250,"loss":0.0525,"grad_norm":0.3080344498,"learning_rate":0.0000027774,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
158
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.8152809434,"step":38500,"loss":0.0497,"grad_norm":0.7108872533,"learning_rate":0.0000024514,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
159
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.840055495,"step":38750,"loss":0.0529,"grad_norm":0.5039321184,"learning_rate":0.0000021254,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
160
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.8648300466,"step":39000,"loss":0.0515,"grad_norm":0.4520630538,"learning_rate":0.0000017995,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
161
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.8896045982,"step":39250,"loss":0.0451,"grad_norm":0.5372409821,"learning_rate":0.0000014735,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
162
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.9143791497,"step":39500,"loss":0.0579,"grad_norm":0.2379768789,"learning_rate":0.0000011475,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
163
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.9391537013,"step":39750,"loss":0.0487,"grad_norm":0.878942132,"learning_rate":0.0000008228,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
164
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.9639282529,"step":40000,"loss":0.0492,"grad_norm":0.1474274844,"learning_rate":0.0000004968,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
165
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":3.9887028045,"step":40250,"loss":0.0538,"grad_norm":0.0074777692,"learning_rate":0.0000001708,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
166
+ {"eval_loss":0.0535237938,"eval_binary":0.54,"eval_rouge":0.9770745949,"eval_llm_as_a_judge":0.54,"eval_runtime":39.8893,"eval_samples_per_second":1.253,"eval_steps_per_second":1.253,"epoch":4.0,"step":40364,"loss":null,"grad_norm":null,"learning_rate":null,"train_runtime":null,"train_samples_per_second":null,"train_steps_per_second":null,"total_flos":null,"train_loss":null}
167
+ {"eval_loss":null,"eval_binary":null,"eval_rouge":null,"eval_llm_as_a_judge":null,"eval_runtime":null,"eval_samples_per_second":null,"eval_steps_per_second":null,"epoch":4.0,"step":40364,"loss":null,"grad_norm":null,"learning_rate":null,"train_runtime":6734.6117,"train_samples_per_second":5.994,"train_steps_per_second":5.994,"total_flos":3.153073337e+17,"train_loss":0.0811412968}
vocab.json ADDED
The diff for this file is too large to render.