dump: training run
Browse files
- README.md +45 -0
- aliases/best.json +12 -0
- aliases/latest.json +9 -0
- config/config.json +128 -0
- config/train.toml +90 -0
- manifest.json +102 -0
- versions/v001000/manifest.json +50 -0
- versions/v001000/model.safetensors +3 -0
- versions/v001000/opt_shard_rank0000.bin +3 -0
- versions/v001000/rng_rank0000.json +0 -0
- versions/v002000/manifest.json +50 -0
- versions/v002000/model.safetensors +3 -0
- versions/v002000/opt_shard_rank0000.bin +3 -0
- versions/v002000/rng_rank0000.json +0 -0
- versions/v003000/manifest.json +50 -0
- versions/v003000/model.safetensors +3 -0
- versions/v003000/opt_shard_rank0000.bin +3 -0
- versions/v003000/rng_rank0000.json +0 -0
- versions/v004000/manifest.json +50 -0
- versions/v004000/model.safetensors +3 -0
- versions/v004000/opt_shard_rank0000.bin +3 -0
- versions/v004000/rng_rank0000.json +0 -0
- versions/v005000/manifest.json +50 -0
- versions/v005000/model.safetensors +3 -0
- versions/v005000/opt_shard_rank0000.bin +3 -0
- versions/v005000/rng_rank0000.json +0 -0
- versions/v006000/manifest.json +50 -0
- versions/v006000/model.safetensors +3 -0
- versions/v006000/opt_shard_rank0000.bin +3 -0
- versions/v006000/rng_rank0000.json +0 -0
- versions/v007000/manifest.json +50 -0
- versions/v007000/model.safetensors +3 -0
- versions/v007000/opt_shard_rank0000.bin +3 -0
- versions/v007000/rng_rank0000.json +0 -0
- versions/v008000/manifest.json +50 -0
- versions/v008000/model.safetensors +3 -0
- versions/v008000/opt_shard_rank0000.bin +3 -0
- versions/v008000/rng_rank0000.json +0 -0
README.md
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
datasets:
|
| 4 |
+
- ylecun/mnist
|
| 5 |
+
language:
|
| 6 |
+
- en
|
| 7 |
+
tags:
|
| 8 |
+
- mnist
|
| 9 |
+
- '784'
|
| 10 |
+
- '256'
|
| 11 |
+
- transformerlm
|
| 12 |
+
- flow-matching
|
| 13 |
+
- dit
|
| 14 |
+
---
|
| 15 |
+
# 🧠🌊 TransformerLM (Flow 784, 256) — MNIST
|
| 16 |
+
|
| 17 |
+
Training run artifacts from https://github.com/triloy8/transformerlm: a minimal flow-matching **DiT-style** image model trained on **MNIST** with a **fixed 784-token context** (28×28 image values) and **conditional generation** using discrete labels plus a null label for classifier-free guidance (CFG).
|
| 18 |
+
|
| 19 |
+
## ✅ Key Facts
|
| 20 |
+
|
| 21 |
+
- **Model type:** `image_dit` flow-matching Transformer
|
| 22 |
+
- **Objective:** Flow matching
|
| 23 |
+
- **Dataset:** MNIST (full 8-bit pixel values, 256 levels)
|
| 24 |
+
- **Context length:** 784 values (28×28 image)
|
| 25 |
+
- **Layers:** 8
|
| 26 |
+
- **Heads:** 16
|
| 27 |
+
- **d_model:** 256
|
| 28 |
+
- **d_ff:** 1024
|
| 29 |
+
- **Training setup:** Single NVIDIA A40 (48GB)
|
| 30 |
+
- **Runtime:** ~3 hours ⏱️
|
| 31 |
+
|
| 32 |
+
## 📦 What’s Inside
|
| 33 |
+
|
| 34 |
+
- Checkpoints saved every 1k steps across the 8k-step run (full run, `v001000`–`v008000`), each including:
|
| 35 |
+
- Optimizer state
|
| 36 |
+
- RNG state
|
| 37 |
+
- Safetensors weights
|
| 38 |
+
- Run config
|
| 39 |
+
- Best checkpoint alias (`v007000`)
|
| 40 |
+
- Latest checkpoint alias (`v008000`)
|
| 41 |
+
|
| 42 |
+
## 🚀 Reproducibility
|
| 43 |
+
|
| 44 |
+
This run was produced by, and dumped from, the exact commit:
|
| 45 |
+
`https://github.com/triloy8/transformerlm/commit/01459662f08e83abc997966415d648563860859e`
|
aliases/best.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alias": "best",
|
| 3 |
+
"manifest_key": "runs/2026-03-08_12-52-45/versions/v007000/manifest.json",
|
| 4 |
+
"metric_name": "val_loss",
|
| 5 |
+
"mode": "min",
|
| 6 |
+
"run_id": "2026-03-08_12-52-45",
|
| 7 |
+
"schema_version": 1,
|
| 8 |
+
"status": "active",
|
| 9 |
+
"step": 7000,
|
| 10 |
+
"value": 0.15839263796806335,
|
| 11 |
+
"version_id": "v007000"
|
| 12 |
+
}
|
aliases/latest.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alias": "latest",
|
| 3 |
+
"manifest_key": "runs/2026-03-08_12-52-45/versions/v008000/manifest.json",
|
| 4 |
+
"run_id": "2026-03-08_12-52-45",
|
| 5 |
+
"schema_version": 1,
|
| 6 |
+
"status": "active",
|
| 7 |
+
"step": 8000,
|
| 8 |
+
"version_id": "v008000"
|
| 9 |
+
}
|
config/config.json
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"checkpointing": {
|
| 3 |
+
"best_metric_name": "val_loss",
|
| 4 |
+
"best_mode": "min",
|
| 5 |
+
"ckpting_save_iter": 1000,
|
| 6 |
+
"enabled": true,
|
| 7 |
+
"resume_from": null,
|
| 8 |
+
"resume_optimizer": true,
|
| 9 |
+
"run_id": null
|
| 10 |
+
},
|
| 11 |
+
"compile": null,
|
| 12 |
+
"data": {
|
| 13 |
+
"cache_all": true,
|
| 14 |
+
"dataset_config": null,
|
| 15 |
+
"dataset_name": "ylecun/mnist",
|
| 16 |
+
"megatron_train_prefix": null,
|
| 17 |
+
"megatron_val_prefix": null,
|
| 18 |
+
"pad_random_shift": false,
|
| 19 |
+
"pad_token_id": null,
|
| 20 |
+
"pipeline_mode": "mnist",
|
| 21 |
+
"runs_path": "runs",
|
| 22 |
+
"shuffle_buffer_size": 0,
|
| 23 |
+
"shuffle_seed": 3407,
|
| 24 |
+
"text_field": "image",
|
| 25 |
+
"tokenizer": null,
|
| 26 |
+
"train_split": "train",
|
| 27 |
+
"val_split": "test"
|
| 28 |
+
},
|
| 29 |
+
"ddp": {
|
| 30 |
+
"backend": "nccl",
|
| 31 |
+
"bucket_size_mb": 200,
|
| 32 |
+
"master_addr": "127.0.0.1",
|
| 33 |
+
"master_port": "29500",
|
| 34 |
+
"nccl_p2p_disable": true,
|
| 35 |
+
"node_rank": 0,
|
| 36 |
+
"num_gpus_per_node": 1,
|
| 37 |
+
"num_nodes": 1
|
| 38 |
+
},
|
| 39 |
+
"logging": {
|
| 40 |
+
"architecture": "DiTImageFlow",
|
| 41 |
+
"backend": "wandb",
|
| 42 |
+
"dataset": "MNIST",
|
| 43 |
+
"log_activation_norms": false,
|
| 44 |
+
"log_grad_norms": true,
|
| 45 |
+
"log_p_mask_bucket_loss": false,
|
| 46 |
+
"log_weight_norms": true,
|
| 47 |
+
"p_mask_bucket_edges": null,
|
| 48 |
+
"run_name": null,
|
| 49 |
+
"val_log_every": 0,
|
| 50 |
+
"val_log_samples": 0
|
| 51 |
+
},
|
| 52 |
+
"model": {
|
| 53 |
+
"attention_backend": "torch_sdpa",
|
| 54 |
+
"attention_sdp_backend": "auto",
|
| 55 |
+
"context_length": 784,
|
| 56 |
+
"d_ff": 1024,
|
| 57 |
+
"d_model": 256,
|
| 58 |
+
"device": "cuda",
|
| 59 |
+
"dtype": "float32",
|
| 60 |
+
"eot_token_id": null,
|
| 61 |
+
"image_height": 28,
|
| 62 |
+
"image_width": 28,
|
| 63 |
+
"label_vocab_size": 11,
|
| 64 |
+
"mask_token_id": null,
|
| 65 |
+
"model_type": "image_dit",
|
| 66 |
+
"noise_epsilon": 0.001,
|
| 67 |
+
"null_label_id": 10,
|
| 68 |
+
"num_heads": 16,
|
| 69 |
+
"num_layers": 8,
|
| 70 |
+
"pixel_bins": 256,
|
| 71 |
+
"random_trunc_prob": 0.0,
|
| 72 |
+
"rope_theta": 10000.0,
|
| 73 |
+
"use_rope_2d": true,
|
| 74 |
+
"vocab_size": 257
|
| 75 |
+
},
|
| 76 |
+
"optimizer": {
|
| 77 |
+
"betas": [
|
| 78 |
+
0.9,
|
| 79 |
+
0.95
|
| 80 |
+
],
|
| 81 |
+
"cosine_cycle_iters": 60000,
|
| 82 |
+
"eps": 1e-08,
|
| 83 |
+
"grad_clip_max_l2_norm": 3.0,
|
| 84 |
+
"initial_learning_rate": 0.0001,
|
| 85 |
+
"lr_schedule": "constant_with_warmup",
|
| 86 |
+
"max_learning_rate": 0.003,
|
| 87 |
+
"min_learning_rate": 0.0003,
|
| 88 |
+
"muon": null,
|
| 89 |
+
"optimizer_name": "adamw",
|
| 90 |
+
"warmup_iters": 200,
|
| 91 |
+
"weight_decay": 0.1
|
| 92 |
+
},
|
| 93 |
+
"train_infer": null,
|
| 94 |
+
"training": {
|
| 95 |
+
"amp_dtype": "bfloat16",
|
| 96 |
+
"amp_enabled": true,
|
| 97 |
+
"batch_size": 256,
|
| 98 |
+
"deterministic_mask": false,
|
| 99 |
+
"eot_mask_loss": false,
|
| 100 |
+
"grad_accum_steps": 1,
|
| 101 |
+
"joint_alpha_schedule": "constant",
|
| 102 |
+
"joint_alpha_schedule_end": 1.0,
|
| 103 |
+
"joint_alpha_schedule_start": 0.0,
|
| 104 |
+
"joint_diffusion_alpha": 0.3,
|
| 105 |
+
"joint_diffusion_alpha_end": null,
|
| 106 |
+
"max_train_iteration": 120000,
|
| 107 |
+
"max_val_iteration": 10,
|
| 108 |
+
"objective": "flow",
|
| 109 |
+
"p_mask_end": null,
|
| 110 |
+
"p_mask_override": null,
|
| 111 |
+
"p_mask_schedule": "none",
|
| 112 |
+
"p_mask_schedule_end": 1.0,
|
| 113 |
+
"p_mask_schedule_start": 0.0,
|
| 114 |
+
"p_mask_start": null,
|
| 115 |
+
"repeat_masking_seed": null,
|
| 116 |
+
"seed": 3407,
|
| 117 |
+
"skip_validation": false,
|
| 118 |
+
"train_loss_ema_decay": 0.99,
|
| 119 |
+
"uncond_label_dropout_prob": 0.1,
|
| 120 |
+
"val_freq_iteration": 250
|
| 121 |
+
},
|
| 122 |
+
"wandb": {
|
| 123 |
+
"architecture": null,
|
| 124 |
+
"dataset": null,
|
| 125 |
+
"entity": "yiltro8-org",
|
| 126 |
+
"project": "mnist_flow"
|
| 127 |
+
}
|
| 128 |
+
}
|
config/train.toml
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[model]
|
| 2 |
+
model_type = "image_dit"
|
| 3 |
+
label_vocab_size = 11
|
| 4 |
+
vocab_size = 257
|
| 5 |
+
pixel_bins = 256
|
| 6 |
+
context_length = 784
|
| 7 |
+
d_model = 256
|
| 8 |
+
num_layers = 8
|
| 9 |
+
num_heads = 16
|
| 10 |
+
d_ff = 1024
|
| 11 |
+
rope_theta = 10000.0
|
| 12 |
+
attention_backend = "torch_sdpa"
|
| 13 |
+
attention_sdp_backend = "auto"
|
| 14 |
+
device = "cuda"
|
| 15 |
+
dtype = "float32"
|
| 16 |
+
null_label_id = 10
|
| 17 |
+
random_trunc_prob = 0.0
|
| 18 |
+
use_rope_2d = true
|
| 19 |
+
image_height = 28
|
| 20 |
+
image_width = 28
|
| 21 |
+
|
| 22 |
+
[optimizer]
|
| 23 |
+
optimizer_name = "adamw"
|
| 24 |
+
betas = [0.9, 0.95]
|
| 25 |
+
eps = 1e-8
|
| 26 |
+
weight_decay = 0.1
|
| 27 |
+
initial_learning_rate = 0.0001
|
| 28 |
+
max_learning_rate = 0.003
|
| 29 |
+
min_learning_rate = 0.0003
|
| 30 |
+
warmup_iters = 200
|
| 31 |
+
cosine_cycle_iters = 60000
|
| 32 |
+
grad_clip_max_l2_norm = 3.0
|
| 33 |
+
lr_schedule = "constant_with_warmup"
|
| 34 |
+
|
| 35 |
+
[training]
|
| 36 |
+
batch_size = 256
|
| 37 |
+
max_train_iteration = 120000
|
| 38 |
+
max_val_iteration = 10
|
| 39 |
+
val_freq_iteration = 250
|
| 40 |
+
seed = 3407
|
| 41 |
+
skip_validation = false
|
| 42 |
+
grad_accum_steps = 1
|
| 43 |
+
train_loss_ema_decay = 0.99
|
| 44 |
+
amp_enabled = true
|
| 45 |
+
amp_dtype = "bfloat16"
|
| 46 |
+
objective = "flow"
|
| 47 |
+
uncond_label_dropout_prob = 0.1
|
| 48 |
+
|
| 49 |
+
[data]
|
| 50 |
+
runs_path = "./runs"
|
| 51 |
+
dataset_name = "ylecun/mnist"
|
| 52 |
+
train_split = "train"
|
| 53 |
+
val_split = "test"
|
| 54 |
+
text_field = "image"
|
| 55 |
+
pipeline_mode = "mnist"
|
| 56 |
+
shuffle_buffer_size = 0
|
| 57 |
+
cache_all = true
|
| 58 |
+
shuffle_seed = 3407
|
| 59 |
+
|
| 60 |
+
[logging]
|
| 61 |
+
backend = "wandb"
|
| 62 |
+
architecture = "DiTImageFlow"
|
| 63 |
+
dataset = "MNIST"
|
| 64 |
+
log_activation_norms = false
|
| 65 |
+
log_weight_norms = true
|
| 66 |
+
log_grad_norms = true
|
| 67 |
+
log_p_mask_bucket_loss = false
|
| 68 |
+
val_log_every = 0
|
| 69 |
+
val_log_samples = 0
|
| 70 |
+
|
| 71 |
+
[wandb]
|
| 72 |
+
entity = "yiltro8-org"
|
| 73 |
+
project = "mnist_flow"
|
| 74 |
+
|
| 75 |
+
[ddp]
|
| 76 |
+
backend = "nccl"
|
| 77 |
+
num_nodes = 1
|
| 78 |
+
num_gpus_per_node = 1
|
| 79 |
+
node_rank = 0
|
| 80 |
+
master_addr = "127.0.0.1"
|
| 81 |
+
master_port = "29500"
|
| 82 |
+
bucket_size_mb = 200
|
| 83 |
+
nccl_p2p_disable = true
|
| 84 |
+
|
| 85 |
+
[checkpointing]
|
| 86 |
+
enabled = true
|
| 87 |
+
ckpting_save_iter = 1000
|
| 88 |
+
resume_optimizer = true
|
| 89 |
+
best_metric_name = "val_loss"
|
| 90 |
+
best_mode = "min"
|
manifest.json
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"aliases": {
|
| 3 |
+
"best": {
|
| 4 |
+
"metric_name": "val_loss",
|
| 5 |
+
"mode": "min",
|
| 6 |
+
"status": "active",
|
| 7 |
+
"step": 7000,
|
| 8 |
+
"value": 0.15839263796806335,
|
| 9 |
+
"version_id": "v007000"
|
| 10 |
+
},
|
| 11 |
+
"latest": {
|
| 12 |
+
"step": 8000,
|
| 13 |
+
"version_id": "v008000"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"config": {
|
| 17 |
+
"bytes": 1727,
|
| 18 |
+
"key": "runs/2026-03-08_12-52-45/config/train.toml",
|
| 19 |
+
"sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
|
| 20 |
+
},
|
| 21 |
+
"created_at": "2026-03-08T12:52:48.972713Z",
|
| 22 |
+
"paths": {
|
| 23 |
+
"layout_version": 1,
|
| 24 |
+
"root_local": "runs/2026-03-08_12-52-45"
|
| 25 |
+
},
|
| 26 |
+
"run_id": "2026-03-08_12-52-45",
|
| 27 |
+
"schema_version": 1,
|
| 28 |
+
"versions": [
|
| 29 |
+
{
|
| 30 |
+
"created_at": "2026-03-08T13:15:30.591640Z",
|
| 31 |
+
"metrics": {
|
| 32 |
+
"val_loss": 0.19012750685214996
|
| 33 |
+
},
|
| 34 |
+
"model_key": "runs/2026-03-08_12-52-45/versions/v001000/model.safetensors",
|
| 35 |
+
"step": 1000,
|
| 36 |
+
"version_id": "v001000"
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"created_at": "2026-03-08T13:37:49.696556Z",
|
| 40 |
+
"metrics": {
|
| 41 |
+
"val_loss": 0.16816174983978271
|
| 42 |
+
},
|
| 43 |
+
"model_key": "runs/2026-03-08_12-52-45/versions/v002000/model.safetensors",
|
| 44 |
+
"step": 2000,
|
| 45 |
+
"version_id": "v002000"
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"created_at": "2026-03-08T14:00:08.773296Z",
|
| 49 |
+
"metrics": {
|
| 50 |
+
"val_loss": 0.16450278460979462
|
| 51 |
+
},
|
| 52 |
+
"model_key": "runs/2026-03-08_12-52-45/versions/v003000/model.safetensors",
|
| 53 |
+
"step": 3000,
|
| 54 |
+
"version_id": "v003000"
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"created_at": "2026-03-08T14:22:26.642793Z",
|
| 58 |
+
"metrics": {
|
| 59 |
+
"val_loss": 0.15943044424057007
|
| 60 |
+
},
|
| 61 |
+
"model_key": "runs/2026-03-08_12-52-45/versions/v004000/model.safetensors",
|
| 62 |
+
"step": 4000,
|
| 63 |
+
"version_id": "v004000"
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"created_at": "2026-03-08T14:44:45.703082Z",
|
| 67 |
+
"metrics": {
|
| 68 |
+
"val_loss": 0.15903371572494507
|
| 69 |
+
},
|
| 70 |
+
"model_key": "runs/2026-03-08_12-52-45/versions/v005000/model.safetensors",
|
| 71 |
+
"step": 5000,
|
| 72 |
+
"version_id": "v005000"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"created_at": "2026-03-08T15:07:04.573166Z",
|
| 76 |
+
"metrics": {
|
| 77 |
+
"val_loss": 0.16576382517814636
|
| 78 |
+
},
|
| 79 |
+
"model_key": "runs/2026-03-08_12-52-45/versions/v006000/model.safetensors",
|
| 80 |
+
"step": 6000,
|
| 81 |
+
"version_id": "v006000"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"created_at": "2026-03-08T15:29:21.920990Z",
|
| 85 |
+
"metrics": {
|
| 86 |
+
"val_loss": 0.15839263796806335
|
| 87 |
+
},
|
| 88 |
+
"model_key": "runs/2026-03-08_12-52-45/versions/v007000/model.safetensors",
|
| 89 |
+
"step": 7000,
|
| 90 |
+
"version_id": "v007000"
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"created_at": "2026-03-08T15:51:37.216665Z",
|
| 94 |
+
"metrics": {
|
| 95 |
+
"val_loss": 0.15889549255371094
|
| 96 |
+
},
|
| 97 |
+
"model_key": "runs/2026-03-08_12-52-45/versions/v008000/model.safetensors",
|
| 98 |
+
"step": 8000,
|
| 99 |
+
"version_id": "v008000"
|
| 100 |
+
}
|
| 101 |
+
]
|
| 102 |
+
}
|
versions/v001000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1727,
|
| 6 |
+
"key": "runs/2026-03-08_12-52-45/config/train.toml",
|
| 7 |
+
"sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-03-08T13:15:30.591640Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.19012750685214996
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42255608,
|
| 15 |
+
"key": "runs/2026-03-08_12-52-45/versions/v001000/model.safetensors",
|
| 16 |
+
"sha256": "a8fb58f5c19addc5a21b315e02add6f2d7fc2a64e3c054d0d4e4155fe5e88476"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84561723,
|
| 23 |
+
"key": "runs/2026-03-08_12-52-45/versions/v001000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "acfcefbfb530727eb5e00d94a223027711916ca65510ac25c7ac1c50a8c6e558"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-03-08_12-52-45"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 1001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-03-08_12-52-45/versions/v001000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-03-08_12-52-45",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 1000,
|
| 49 |
+
"version_id": "v001000"
|
| 50 |
+
}
|
versions/v001000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a8fb58f5c19addc5a21b315e02add6f2d7fc2a64e3c054d0d4e4155fe5e88476
|
| 3 |
+
size 42255608
|
versions/v001000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:acfcefbfb530727eb5e00d94a223027711916ca65510ac25c7ac1c50a8c6e558
|
| 3 |
+
size 84561723
|
versions/v001000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
versions/v002000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1727,
|
| 6 |
+
"key": "runs/2026-03-08_12-52-45/config/train.toml",
|
| 7 |
+
"sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-03-08T13:37:49.696556Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.16816174983978271
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42255608,
|
| 15 |
+
"key": "runs/2026-03-08_12-52-45/versions/v002000/model.safetensors",
|
| 16 |
+
"sha256": "eb1b891ccd36e9f026e49e5fcdd7c18e3bb7bc1dadba8de9bef0cbafed54e752"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84561723,
|
| 23 |
+
"key": "runs/2026-03-08_12-52-45/versions/v002000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "3ed59bc8dc7eeb8fb4884027d41e109cf11994f2c6180d87fb786b2494cbfd02"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-03-08_12-52-45"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 2001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-03-08_12-52-45/versions/v002000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-03-08_12-52-45",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 2000,
|
| 49 |
+
"version_id": "v002000"
|
| 50 |
+
}
|
versions/v002000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb1b891ccd36e9f026e49e5fcdd7c18e3bb7bc1dadba8de9bef0cbafed54e752
|
| 3 |
+
size 42255608
|
versions/v002000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ed59bc8dc7eeb8fb4884027d41e109cf11994f2c6180d87fb786b2494cbfd02
|
| 3 |
+
size 84561723
|
versions/v002000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
versions/v003000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1727,
|
| 6 |
+
"key": "runs/2026-03-08_12-52-45/config/train.toml",
|
| 7 |
+
"sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-03-08T14:00:08.773296Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.16450278460979462
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42255608,
|
| 15 |
+
"key": "runs/2026-03-08_12-52-45/versions/v003000/model.safetensors",
|
| 16 |
+
"sha256": "02a48a588479301bdf67189ebd4a71501a12a8362a668deeb36392a5498c5757"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84561723,
|
| 23 |
+
"key": "runs/2026-03-08_12-52-45/versions/v003000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "1697a8a22f8da31b290bf28ca102d83db676ff8cd209f35821eb3592952b18c9"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-03-08_12-52-45"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 3001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-03-08_12-52-45/versions/v003000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-03-08_12-52-45",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 3000,
|
| 49 |
+
"version_id": "v003000"
|
| 50 |
+
}
|
versions/v003000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02a48a588479301bdf67189ebd4a71501a12a8362a668deeb36392a5498c5757
|
| 3 |
+
size 42255608
|
versions/v003000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1697a8a22f8da31b290bf28ca102d83db676ff8cd209f35821eb3592952b18c9
|
| 3 |
+
size 84561723
|
versions/v003000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
versions/v004000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1727,
|
| 6 |
+
"key": "runs/2026-03-08_12-52-45/config/train.toml",
|
| 7 |
+
"sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-03-08T14:22:26.642793Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.15943044424057007
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42255608,
|
| 15 |
+
"key": "runs/2026-03-08_12-52-45/versions/v004000/model.safetensors",
|
| 16 |
+
"sha256": "20fdc6efe3886dab442a9d6ce8ff340fca5450c06db0ebc2a0fec4c72527c299"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84561723,
|
| 23 |
+
"key": "runs/2026-03-08_12-52-45/versions/v004000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "b2ea7c4229e7f15fc8df06cc4aa4826568a902a19222fdae96d6809cccbb452c"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-03-08_12-52-45"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 4001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-03-08_12-52-45/versions/v004000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-03-08_12-52-45",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 4000,
|
| 49 |
+
"version_id": "v004000"
|
| 50 |
+
}
|
versions/v004000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:20fdc6efe3886dab442a9d6ce8ff340fca5450c06db0ebc2a0fec4c72527c299
|
| 3 |
+
size 42255608
|
versions/v004000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2ea7c4229e7f15fc8df06cc4aa4826568a902a19222fdae96d6809cccbb452c
|
| 3 |
+
size 84561723
|
versions/v004000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
versions/v005000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1727,
|
| 6 |
+
"key": "runs/2026-03-08_12-52-45/config/train.toml",
|
| 7 |
+
"sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-03-08T14:44:45.703082Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.15903371572494507
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42255608,
|
| 15 |
+
"key": "runs/2026-03-08_12-52-45/versions/v005000/model.safetensors",
|
| 16 |
+
"sha256": "28c6c1419029bde3d80261904b955f8f4b584c6d7553976020ed830dd524c4b0"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84561723,
|
| 23 |
+
"key": "runs/2026-03-08_12-52-45/versions/v005000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "2fd9c186811cb3c2e02687a8836b8c7210a7e63b2942cbb545c2206dff03d410"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-03-08_12-52-45"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 5001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-03-08_12-52-45/versions/v005000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-03-08_12-52-45",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 5000,
|
| 49 |
+
"version_id": "v005000"
|
| 50 |
+
}
|
versions/v005000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28c6c1419029bde3d80261904b955f8f4b584c6d7553976020ed830dd524c4b0
|
| 3 |
+
size 42255608
|
versions/v005000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2fd9c186811cb3c2e02687a8836b8c7210a7e63b2942cbb545c2206dff03d410
|
| 3 |
+
size 84561723
|
versions/v005000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
versions/v006000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1727,
|
| 6 |
+
"key": "runs/2026-03-08_12-52-45/config/train.toml",
|
| 7 |
+
"sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-03-08T15:07:04.573166Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.16576382517814636
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42255608,
|
| 15 |
+
"key": "runs/2026-03-08_12-52-45/versions/v006000/model.safetensors",
|
| 16 |
+
"sha256": "4a45329c6cf33a7859a7b5176abe1e9ddd93b2808bcb8ef904dfa8d9da2a0628"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84561723,
|
| 23 |
+
"key": "runs/2026-03-08_12-52-45/versions/v006000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "230764e6ed078a33d9020095686d1ee471abce777071c68b30c18f800eecfe29"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-03-08_12-52-45"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 6001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-03-08_12-52-45/versions/v006000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-03-08_12-52-45",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 6000,
|
| 49 |
+
"version_id": "v006000"
|
| 50 |
+
}
|
versions/v006000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4a45329c6cf33a7859a7b5176abe1e9ddd93b2808bcb8ef904dfa8d9da2a0628
|
| 3 |
+
size 42255608
|
versions/v006000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:230764e6ed078a33d9020095686d1ee471abce777071c68b30c18f800eecfe29
|
| 3 |
+
size 84561723
|
versions/v006000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
versions/v007000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1727,
|
| 6 |
+
"key": "runs/2026-03-08_12-52-45/config/train.toml",
|
| 7 |
+
"sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-03-08T15:29:21.920990Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.15839263796806335
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42255608,
|
| 15 |
+
"key": "runs/2026-03-08_12-52-45/versions/v007000/model.safetensors",
|
| 16 |
+
"sha256": "a2aab911a7a6732e696b6dcb65909f1dbfacc4575157bb083d9700d2be3cfce1"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84561723,
|
| 23 |
+
"key": "runs/2026-03-08_12-52-45/versions/v007000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "7832cd4f8a3e30dba3ab050f90b1deb18334fa75bd53d90412f1447ad36898e8"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-03-08_12-52-45"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 7001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-03-08_12-52-45/versions/v007000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-03-08_12-52-45",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 7000,
|
| 49 |
+
"version_id": "v007000"
|
| 50 |
+
}
|
versions/v007000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2aab911a7a6732e696b6dcb65909f1dbfacc4575157bb083d9700d2be3cfce1
|
| 3 |
+
size 42255608
|
versions/v007000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7832cd4f8a3e30dba3ab050f90b1deb18334fa75bd53d90412f1447ad36898e8
|
| 3 |
+
size 84561723
|
versions/v007000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
versions/v008000/manifest.json
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"amp_scaler": null,
|
| 3 |
+
"code": {},
|
| 4 |
+
"config": {
|
| 5 |
+
"bytes": 1727,
|
| 6 |
+
"key": "runs/2026-03-08_12-52-45/config/train.toml",
|
| 7 |
+
"sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
|
| 8 |
+
},
|
| 9 |
+
"created_at": "2026-03-08T15:51:37.216665Z",
|
| 10 |
+
"metrics": {
|
| 11 |
+
"val_loss": 0.15889549255371094
|
| 12 |
+
},
|
| 13 |
+
"model": {
|
| 14 |
+
"bytes": 42255608,
|
| 15 |
+
"key": "runs/2026-03-08_12-52-45/versions/v008000/model.safetensors",
|
| 16 |
+
"sha256": "7930eda42795bcc2cabb42e7ce6a44a4a20a409289cfd18a95daab2bf4ead9b3"
|
| 17 |
+
},
|
| 18 |
+
"optimizer": {
|
| 19 |
+
"sharding": "custom",
|
| 20 |
+
"shards": [
|
| 21 |
+
{
|
| 22 |
+
"bytes": 84561723,
|
| 23 |
+
"key": "runs/2026-03-08_12-52-45/versions/v008000/opt_shard_rank0000.bin",
|
| 24 |
+
"rank": 0,
|
| 25 |
+
"sha256": "c6a78c31cdcbdf6eb19de87c477c3cd40c50cbd29222e0eaef1014f5895d249b"
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"paths": {
|
| 30 |
+
"layout_version": 1,
|
| 31 |
+
"root_local": "runs/2026-03-08_12-52-45"
|
| 32 |
+
},
|
| 33 |
+
"resume": {
|
| 34 |
+
"base_step": 8001,
|
| 35 |
+
"exact": true
|
| 36 |
+
},
|
| 37 |
+
"rng": {
|
| 38 |
+
"keys": [
|
| 39 |
+
{
|
| 40 |
+
"key": "runs/2026-03-08_12-52-45/versions/v008000/rng_rank0000.json",
|
| 41 |
+
"rank": 0
|
| 42 |
+
}
|
| 43 |
+
],
|
| 44 |
+
"per_rank": true
|
| 45 |
+
},
|
| 46 |
+
"run_id": "2026-03-08_12-52-45",
|
| 47 |
+
"schema_version": 1,
|
| 48 |
+
"step": 8000,
|
| 49 |
+
"version_id": "v008000"
|
| 50 |
+
}
|
versions/v008000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7930eda42795bcc2cabb42e7ce6a44a4a20a409289cfd18a95daab2bf4ead9b3
|
| 3 |
+
size 42255608
|
versions/v008000/opt_shard_rank0000.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6a78c31cdcbdf6eb19de87c477c3cd40c50cbd29222e0eaef1014f5895d249b
|
| 3 |
+
size 84561723
|
versions/v008000/rng_rank0000.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|