trixyL committed on
Commit
d51cf33
·
1 Parent(s): 038f351

dump: training run

Browse files
README.md ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - ylecun/mnist
5
+ language:
6
+ - en
7
+ tags:
8
+ - mnist
9
+ - '784'
10
+ - '256'
11
+ - transformerlm
12
+ - flow-matching
13
+ - dit
14
+ ---
15
+ # 🧠🌊 TransformerLM (Flow 784, 256) — MNIST
16
+
17
+ Training run artifacts from https://github.com/triloy8/transformerlm. The model is a minimal flow-matching **DiT-style** image model trained on **MNIST** with a **fixed 784-token context** (the 28×28 pixel values of one image). It supports **conditional generation** using the ten digit labels plus a null label for classifier-free guidance (CFG).
18
+
19
+ ## ✅ Key Facts
20
+
21
+ - **Model type:** `image_dit` flow-matching Transformer
22
+ - **Objective:** Flow matching
23
+ - **Dataset:** MNIST (full 8-bit pixel values, 256 levels)
24
+ - **Context length:** 784 values (28×28 image)
25
+ - **Layers:** 8
26
+ - **Heads:** 16
27
+ - **d_model:** 256
28
+ - **d_ff:** 1024
29
+ - **Training setup:** Single NVIDIA A40 (48GB)
30
+ - **Runtime:** ~3 hours ⏱️
31
+
32
+ ## 📦 What’s Inside
33
+
34
+ - Checkpoints saved every 1k steps up to step 8k (full run), including:
35
+ - Optimizer state
36
+ - RNG state
37
+ - Safetensors weights
38
+ - Run config
39
+ - Best checkpoint alias (`v007000`)
40
+ - Latest checkpoint alias (`v008000`)
41
+
42
+ ## 🚀 Reproducibility
43
+
44
+ These artifacts were produced by the training code at this exact commit:
45
+ `https://github.com/triloy8/transformerlm/commit/01459662f08e83abc997966415d648563860859e`
aliases/best.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alias": "best",
3
+ "manifest_key": "runs/2026-03-08_12-52-45/versions/v007000/manifest.json",
4
+ "metric_name": "val_loss",
5
+ "mode": "min",
6
+ "run_id": "2026-03-08_12-52-45",
7
+ "schema_version": 1,
8
+ "status": "active",
9
+ "step": 7000,
10
+ "value": 0.15839263796806335,
11
+ "version_id": "v007000"
12
+ }
aliases/latest.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alias": "latest",
3
+ "manifest_key": "runs/2026-03-08_12-52-45/versions/v008000/manifest.json",
4
+ "run_id": "2026-03-08_12-52-45",
5
+ "schema_version": 1,
6
+ "status": "active",
7
+ "step": 8000,
8
+ "version_id": "v008000"
9
+ }
config/config.json ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "checkpointing": {
3
+ "best_metric_name": "val_loss",
4
+ "best_mode": "min",
5
+ "ckpting_save_iter": 1000,
6
+ "enabled": true,
7
+ "resume_from": null,
8
+ "resume_optimizer": true,
9
+ "run_id": null
10
+ },
11
+ "compile": null,
12
+ "data": {
13
+ "cache_all": true,
14
+ "dataset_config": null,
15
+ "dataset_name": "ylecun/mnist",
16
+ "megatron_train_prefix": null,
17
+ "megatron_val_prefix": null,
18
+ "pad_random_shift": false,
19
+ "pad_token_id": null,
20
+ "pipeline_mode": "mnist",
21
+ "runs_path": "runs",
22
+ "shuffle_buffer_size": 0,
23
+ "shuffle_seed": 3407,
24
+ "text_field": "image",
25
+ "tokenizer": null,
26
+ "train_split": "train",
27
+ "val_split": "test"
28
+ },
29
+ "ddp": {
30
+ "backend": "nccl",
31
+ "bucket_size_mb": 200,
32
+ "master_addr": "127.0.0.1",
33
+ "master_port": "29500",
34
+ "nccl_p2p_disable": true,
35
+ "node_rank": 0,
36
+ "num_gpus_per_node": 1,
37
+ "num_nodes": 1
38
+ },
39
+ "logging": {
40
+ "architecture": "DiTImageFlow",
41
+ "backend": "wandb",
42
+ "dataset": "MNIST",
43
+ "log_activation_norms": false,
44
+ "log_grad_norms": true,
45
+ "log_p_mask_bucket_loss": false,
46
+ "log_weight_norms": true,
47
+ "p_mask_bucket_edges": null,
48
+ "run_name": null,
49
+ "val_log_every": 0,
50
+ "val_log_samples": 0
51
+ },
52
+ "model": {
53
+ "attention_backend": "torch_sdpa",
54
+ "attention_sdp_backend": "auto",
55
+ "context_length": 784,
56
+ "d_ff": 1024,
57
+ "d_model": 256,
58
+ "device": "cuda",
59
+ "dtype": "float32",
60
+ "eot_token_id": null,
61
+ "image_height": 28,
62
+ "image_width": 28,
63
+ "label_vocab_size": 11,
64
+ "mask_token_id": null,
65
+ "model_type": "image_dit",
66
+ "noise_epsilon": 0.001,
67
+ "null_label_id": 10,
68
+ "num_heads": 16,
69
+ "num_layers": 8,
70
+ "pixel_bins": 256,
71
+ "random_trunc_prob": 0.0,
72
+ "rope_theta": 10000.0,
73
+ "use_rope_2d": true,
74
+ "vocab_size": 257
75
+ },
76
+ "optimizer": {
77
+ "betas": [
78
+ 0.9,
79
+ 0.95
80
+ ],
81
+ "cosine_cycle_iters": 60000,
82
+ "eps": 1e-08,
83
+ "grad_clip_max_l2_norm": 3.0,
84
+ "initial_learning_rate": 0.0001,
85
+ "lr_schedule": "constant_with_warmup",
86
+ "max_learning_rate": 0.003,
87
+ "min_learning_rate": 0.0003,
88
+ "muon": null,
89
+ "optimizer_name": "adamw",
90
+ "warmup_iters": 200,
91
+ "weight_decay": 0.1
92
+ },
93
+ "train_infer": null,
94
+ "training": {
95
+ "amp_dtype": "bfloat16",
96
+ "amp_enabled": true,
97
+ "batch_size": 256,
98
+ "deterministic_mask": false,
99
+ "eot_mask_loss": false,
100
+ "grad_accum_steps": 1,
101
+ "joint_alpha_schedule": "constant",
102
+ "joint_alpha_schedule_end": 1.0,
103
+ "joint_alpha_schedule_start": 0.0,
104
+ "joint_diffusion_alpha": 0.3,
105
+ "joint_diffusion_alpha_end": null,
106
+ "max_train_iteration": 120000,
107
+ "max_val_iteration": 10,
108
+ "objective": "flow",
109
+ "p_mask_end": null,
110
+ "p_mask_override": null,
111
+ "p_mask_schedule": "none",
112
+ "p_mask_schedule_end": 1.0,
113
+ "p_mask_schedule_start": 0.0,
114
+ "p_mask_start": null,
115
+ "repeat_masking_seed": null,
116
+ "seed": 3407,
117
+ "skip_validation": false,
118
+ "train_loss_ema_decay": 0.99,
119
+ "uncond_label_dropout_prob": 0.1,
120
+ "val_freq_iteration": 250
121
+ },
122
+ "wandb": {
123
+ "architecture": null,
124
+ "dataset": null,
125
+ "entity": "yiltro8-org",
126
+ "project": "mnist_flow"
127
+ }
128
+ }
config/train.toml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ model_type = "image_dit"
3
+ label_vocab_size = 11
4
+ vocab_size = 257
5
+ pixel_bins = 256
6
+ context_length = 784
7
+ d_model = 256
8
+ num_layers = 8
9
+ num_heads = 16
10
+ d_ff = 1024
11
+ rope_theta = 10000.0
12
+ attention_backend = "torch_sdpa"
13
+ attention_sdp_backend = "auto"
14
+ device = "cuda"
15
+ dtype = "float32"
16
+ null_label_id = 10
17
+ random_trunc_prob = 0.0
18
+ use_rope_2d = true
19
+ image_height = 28
20
+ image_width = 28
21
+
22
+ [optimizer]
23
+ optimizer_name = "adamw"
24
+ betas = [0.9, 0.95]
25
+ eps = 1e-8
26
+ weight_decay = 0.1
27
+ initial_learning_rate = 0.0001
28
+ max_learning_rate = 0.003
29
+ min_learning_rate = 0.0003
30
+ warmup_iters = 200
31
+ cosine_cycle_iters = 60000
32
+ grad_clip_max_l2_norm = 3.0
33
+ lr_schedule = "constant_with_warmup"
34
+
35
+ [training]
36
+ batch_size = 256
37
+ max_train_iteration = 120000
38
+ max_val_iteration = 10
39
+ val_freq_iteration = 250
40
+ seed = 3407
41
+ skip_validation = false
42
+ grad_accum_steps = 1
43
+ train_loss_ema_decay = 0.99
44
+ amp_enabled = true
45
+ amp_dtype = "bfloat16"
46
+ objective = "flow"
47
+ uncond_label_dropout_prob = 0.1
48
+
49
+ [data]
50
+ runs_path = "./runs"
51
+ dataset_name = "ylecun/mnist"
52
+ train_split = "train"
53
+ val_split = "test"
54
+ text_field = "image"
55
+ pipeline_mode = "mnist"
56
+ shuffle_buffer_size = 0
57
+ cache_all = true
58
+ shuffle_seed = 3407
59
+
60
+ [logging]
61
+ backend = "wandb"
62
+ architecture = "DiTImageFlow"
63
+ dataset = "MNIST"
64
+ log_activation_norms = false
65
+ log_weight_norms = true
66
+ log_grad_norms = true
67
+ log_p_mask_bucket_loss = false
68
+ val_log_every = 0
69
+ val_log_samples = 0
70
+
71
+ [wandb]
72
+ entity = "yiltro8-org"
73
+ project = "mnist_flow"
74
+
75
+ [ddp]
76
+ backend = "nccl"
77
+ num_nodes = 1
78
+ num_gpus_per_node = 1
79
+ node_rank = 0
80
+ master_addr = "127.0.0.1"
81
+ master_port = "29500"
82
+ bucket_size_mb = 200
83
+ nccl_p2p_disable = true
84
+
85
+ [checkpointing]
86
+ enabled = true
87
+ ckpting_save_iter = 1000
88
+ resume_optimizer = true
89
+ best_metric_name = "val_loss"
90
+ best_mode = "min"
manifest.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aliases": {
3
+ "best": {
4
+ "metric_name": "val_loss",
5
+ "mode": "min",
6
+ "status": "active",
7
+ "step": 7000,
8
+ "value": 0.15839263796806335,
9
+ "version_id": "v007000"
10
+ },
11
+ "latest": {
12
+ "step": 8000,
13
+ "version_id": "v008000"
14
+ }
15
+ },
16
+ "config": {
17
+ "bytes": 1727,
18
+ "key": "runs/2026-03-08_12-52-45/config/train.toml",
19
+ "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
20
+ },
21
+ "created_at": "2026-03-08T12:52:48.972713Z",
22
+ "paths": {
23
+ "layout_version": 1,
24
+ "root_local": "runs/2026-03-08_12-52-45"
25
+ },
26
+ "run_id": "2026-03-08_12-52-45",
27
+ "schema_version": 1,
28
+ "versions": [
29
+ {
30
+ "created_at": "2026-03-08T13:15:30.591640Z",
31
+ "metrics": {
32
+ "val_loss": 0.19012750685214996
33
+ },
34
+ "model_key": "runs/2026-03-08_12-52-45/versions/v001000/model.safetensors",
35
+ "step": 1000,
36
+ "version_id": "v001000"
37
+ },
38
+ {
39
+ "created_at": "2026-03-08T13:37:49.696556Z",
40
+ "metrics": {
41
+ "val_loss": 0.16816174983978271
42
+ },
43
+ "model_key": "runs/2026-03-08_12-52-45/versions/v002000/model.safetensors",
44
+ "step": 2000,
45
+ "version_id": "v002000"
46
+ },
47
+ {
48
+ "created_at": "2026-03-08T14:00:08.773296Z",
49
+ "metrics": {
50
+ "val_loss": 0.16450278460979462
51
+ },
52
+ "model_key": "runs/2026-03-08_12-52-45/versions/v003000/model.safetensors",
53
+ "step": 3000,
54
+ "version_id": "v003000"
55
+ },
56
+ {
57
+ "created_at": "2026-03-08T14:22:26.642793Z",
58
+ "metrics": {
59
+ "val_loss": 0.15943044424057007
60
+ },
61
+ "model_key": "runs/2026-03-08_12-52-45/versions/v004000/model.safetensors",
62
+ "step": 4000,
63
+ "version_id": "v004000"
64
+ },
65
+ {
66
+ "created_at": "2026-03-08T14:44:45.703082Z",
67
+ "metrics": {
68
+ "val_loss": 0.15903371572494507
69
+ },
70
+ "model_key": "runs/2026-03-08_12-52-45/versions/v005000/model.safetensors",
71
+ "step": 5000,
72
+ "version_id": "v005000"
73
+ },
74
+ {
75
+ "created_at": "2026-03-08T15:07:04.573166Z",
76
+ "metrics": {
77
+ "val_loss": 0.16576382517814636
78
+ },
79
+ "model_key": "runs/2026-03-08_12-52-45/versions/v006000/model.safetensors",
80
+ "step": 6000,
81
+ "version_id": "v006000"
82
+ },
83
+ {
84
+ "created_at": "2026-03-08T15:29:21.920990Z",
85
+ "metrics": {
86
+ "val_loss": 0.15839263796806335
87
+ },
88
+ "model_key": "runs/2026-03-08_12-52-45/versions/v007000/model.safetensors",
89
+ "step": 7000,
90
+ "version_id": "v007000"
91
+ },
92
+ {
93
+ "created_at": "2026-03-08T15:51:37.216665Z",
94
+ "metrics": {
95
+ "val_loss": 0.15889549255371094
96
+ },
97
+ "model_key": "runs/2026-03-08_12-52-45/versions/v008000/model.safetensors",
98
+ "step": 8000,
99
+ "version_id": "v008000"
100
+ }
101
+ ]
102
+ }
versions/v001000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1727,
6
+ "key": "runs/2026-03-08_12-52-45/config/train.toml",
7
+ "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
8
+ },
9
+ "created_at": "2026-03-08T13:15:30.591640Z",
10
+ "metrics": {
11
+ "val_loss": 0.19012750685214996
12
+ },
13
+ "model": {
14
+ "bytes": 42255608,
15
+ "key": "runs/2026-03-08_12-52-45/versions/v001000/model.safetensors",
16
+ "sha256": "a8fb58f5c19addc5a21b315e02add6f2d7fc2a64e3c054d0d4e4155fe5e88476"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84561723,
23
+ "key": "runs/2026-03-08_12-52-45/versions/v001000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "acfcefbfb530727eb5e00d94a223027711916ca65510ac25c7ac1c50a8c6e558"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-03-08_12-52-45"
32
+ },
33
+ "resume": {
34
+ "base_step": 1001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-03-08_12-52-45/versions/v001000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-03-08_12-52-45",
47
+ "schema_version": 1,
48
+ "step": 1000,
49
+ "version_id": "v001000"
50
+ }
versions/v001000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8fb58f5c19addc5a21b315e02add6f2d7fc2a64e3c054d0d4e4155fe5e88476
3
+ size 42255608
versions/v001000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acfcefbfb530727eb5e00d94a223027711916ca65510ac25c7ac1c50a8c6e558
3
+ size 84561723
versions/v001000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff
 
versions/v002000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1727,
6
+ "key": "runs/2026-03-08_12-52-45/config/train.toml",
7
+ "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
8
+ },
9
+ "created_at": "2026-03-08T13:37:49.696556Z",
10
+ "metrics": {
11
+ "val_loss": 0.16816174983978271
12
+ },
13
+ "model": {
14
+ "bytes": 42255608,
15
+ "key": "runs/2026-03-08_12-52-45/versions/v002000/model.safetensors",
16
+ "sha256": "eb1b891ccd36e9f026e49e5fcdd7c18e3bb7bc1dadba8de9bef0cbafed54e752"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84561723,
23
+ "key": "runs/2026-03-08_12-52-45/versions/v002000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "3ed59bc8dc7eeb8fb4884027d41e109cf11994f2c6180d87fb786b2494cbfd02"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-03-08_12-52-45"
32
+ },
33
+ "resume": {
34
+ "base_step": 2001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-03-08_12-52-45/versions/v002000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-03-08_12-52-45",
47
+ "schema_version": 1,
48
+ "step": 2000,
49
+ "version_id": "v002000"
50
+ }
versions/v002000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb1b891ccd36e9f026e49e5fcdd7c18e3bb7bc1dadba8de9bef0cbafed54e752
3
+ size 42255608
versions/v002000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ed59bc8dc7eeb8fb4884027d41e109cf11994f2c6180d87fb786b2494cbfd02
3
+ size 84561723
versions/v002000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff
 
versions/v003000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1727,
6
+ "key": "runs/2026-03-08_12-52-45/config/train.toml",
7
+ "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
8
+ },
9
+ "created_at": "2026-03-08T14:00:08.773296Z",
10
+ "metrics": {
11
+ "val_loss": 0.16450278460979462
12
+ },
13
+ "model": {
14
+ "bytes": 42255608,
15
+ "key": "runs/2026-03-08_12-52-45/versions/v003000/model.safetensors",
16
+ "sha256": "02a48a588479301bdf67189ebd4a71501a12a8362a668deeb36392a5498c5757"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84561723,
23
+ "key": "runs/2026-03-08_12-52-45/versions/v003000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "1697a8a22f8da31b290bf28ca102d83db676ff8cd209f35821eb3592952b18c9"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-03-08_12-52-45"
32
+ },
33
+ "resume": {
34
+ "base_step": 3001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-03-08_12-52-45/versions/v003000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-03-08_12-52-45",
47
+ "schema_version": 1,
48
+ "step": 3000,
49
+ "version_id": "v003000"
50
+ }
versions/v003000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02a48a588479301bdf67189ebd4a71501a12a8362a668deeb36392a5498c5757
3
+ size 42255608
versions/v003000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1697a8a22f8da31b290bf28ca102d83db676ff8cd209f35821eb3592952b18c9
3
+ size 84561723
versions/v003000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff
 
versions/v004000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1727,
6
+ "key": "runs/2026-03-08_12-52-45/config/train.toml",
7
+ "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
8
+ },
9
+ "created_at": "2026-03-08T14:22:26.642793Z",
10
+ "metrics": {
11
+ "val_loss": 0.15943044424057007
12
+ },
13
+ "model": {
14
+ "bytes": 42255608,
15
+ "key": "runs/2026-03-08_12-52-45/versions/v004000/model.safetensors",
16
+ "sha256": "20fdc6efe3886dab442a9d6ce8ff340fca5450c06db0ebc2a0fec4c72527c299"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84561723,
23
+ "key": "runs/2026-03-08_12-52-45/versions/v004000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "b2ea7c4229e7f15fc8df06cc4aa4826568a902a19222fdae96d6809cccbb452c"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-03-08_12-52-45"
32
+ },
33
+ "resume": {
34
+ "base_step": 4001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-03-08_12-52-45/versions/v004000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-03-08_12-52-45",
47
+ "schema_version": 1,
48
+ "step": 4000,
49
+ "version_id": "v004000"
50
+ }
versions/v004000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20fdc6efe3886dab442a9d6ce8ff340fca5450c06db0ebc2a0fec4c72527c299
3
+ size 42255608
versions/v004000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2ea7c4229e7f15fc8df06cc4aa4826568a902a19222fdae96d6809cccbb452c
3
+ size 84561723
versions/v004000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff
 
versions/v005000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1727,
6
+ "key": "runs/2026-03-08_12-52-45/config/train.toml",
7
+ "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
8
+ },
9
+ "created_at": "2026-03-08T14:44:45.703082Z",
10
+ "metrics": {
11
+ "val_loss": 0.15903371572494507
12
+ },
13
+ "model": {
14
+ "bytes": 42255608,
15
+ "key": "runs/2026-03-08_12-52-45/versions/v005000/model.safetensors",
16
+ "sha256": "28c6c1419029bde3d80261904b955f8f4b584c6d7553976020ed830dd524c4b0"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84561723,
23
+ "key": "runs/2026-03-08_12-52-45/versions/v005000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "2fd9c186811cb3c2e02687a8836b8c7210a7e63b2942cbb545c2206dff03d410"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-03-08_12-52-45"
32
+ },
33
+ "resume": {
34
+ "base_step": 5001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-03-08_12-52-45/versions/v005000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-03-08_12-52-45",
47
+ "schema_version": 1,
48
+ "step": 5000,
49
+ "version_id": "v005000"
50
+ }
versions/v005000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28c6c1419029bde3d80261904b955f8f4b584c6d7553976020ed830dd524c4b0
3
+ size 42255608
versions/v005000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fd9c186811cb3c2e02687a8836b8c7210a7e63b2942cbb545c2206dff03d410
3
+ size 84561723
versions/v005000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff
 
versions/v006000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1727,
6
+ "key": "runs/2026-03-08_12-52-45/config/train.toml",
7
+ "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
8
+ },
9
+ "created_at": "2026-03-08T15:07:04.573166Z",
10
+ "metrics": {
11
+ "val_loss": 0.16576382517814636
12
+ },
13
+ "model": {
14
+ "bytes": 42255608,
15
+ "key": "runs/2026-03-08_12-52-45/versions/v006000/model.safetensors",
16
+ "sha256": "4a45329c6cf33a7859a7b5176abe1e9ddd93b2808bcb8ef904dfa8d9da2a0628"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84561723,
23
+ "key": "runs/2026-03-08_12-52-45/versions/v006000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "230764e6ed078a33d9020095686d1ee471abce777071c68b30c18f800eecfe29"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-03-08_12-52-45"
32
+ },
33
+ "resume": {
34
+ "base_step": 6001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-03-08_12-52-45/versions/v006000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-03-08_12-52-45",
47
+ "schema_version": 1,
48
+ "step": 6000,
49
+ "version_id": "v006000"
50
+ }
versions/v006000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a45329c6cf33a7859a7b5176abe1e9ddd93b2808bcb8ef904dfa8d9da2a0628
3
+ size 42255608
versions/v006000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:230764e6ed078a33d9020095686d1ee471abce777071c68b30c18f800eecfe29
3
+ size 84561723
versions/v006000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff
 
versions/v007000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1727,
6
+ "key": "runs/2026-03-08_12-52-45/config/train.toml",
7
+ "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
8
+ },
9
+ "created_at": "2026-03-08T15:29:21.920990Z",
10
+ "metrics": {
11
+ "val_loss": 0.15839263796806335
12
+ },
13
+ "model": {
14
+ "bytes": 42255608,
15
+ "key": "runs/2026-03-08_12-52-45/versions/v007000/model.safetensors",
16
+ "sha256": "a2aab911a7a6732e696b6dcb65909f1dbfacc4575157bb083d9700d2be3cfce1"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84561723,
23
+ "key": "runs/2026-03-08_12-52-45/versions/v007000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "7832cd4f8a3e30dba3ab050f90b1deb18334fa75bd53d90412f1447ad36898e8"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-03-08_12-52-45"
32
+ },
33
+ "resume": {
34
+ "base_step": 7001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-03-08_12-52-45/versions/v007000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-03-08_12-52-45",
47
+ "schema_version": 1,
48
+ "step": 7000,
49
+ "version_id": "v007000"
50
+ }
versions/v007000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2aab911a7a6732e696b6dcb65909f1dbfacc4575157bb083d9700d2be3cfce1
3
+ size 42255608
versions/v007000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7832cd4f8a3e30dba3ab050f90b1deb18334fa75bd53d90412f1447ad36898e8
3
+ size 84561723
versions/v007000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff
 
versions/v008000/manifest.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "amp_scaler": null,
3
+ "code": {},
4
+ "config": {
5
+ "bytes": 1727,
6
+ "key": "runs/2026-03-08_12-52-45/config/train.toml",
7
+ "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
8
+ },
9
+ "created_at": "2026-03-08T15:51:37.216665Z",
10
+ "metrics": {
11
+ "val_loss": 0.15889549255371094
12
+ },
13
+ "model": {
14
+ "bytes": 42255608,
15
+ "key": "runs/2026-03-08_12-52-45/versions/v008000/model.safetensors",
16
+ "sha256": "7930eda42795bcc2cabb42e7ce6a44a4a20a409289cfd18a95daab2bf4ead9b3"
17
+ },
18
+ "optimizer": {
19
+ "sharding": "custom",
20
+ "shards": [
21
+ {
22
+ "bytes": 84561723,
23
+ "key": "runs/2026-03-08_12-52-45/versions/v008000/opt_shard_rank0000.bin",
24
+ "rank": 0,
25
+ "sha256": "c6a78c31cdcbdf6eb19de87c477c3cd40c50cbd29222e0eaef1014f5895d249b"
26
+ }
27
+ ]
28
+ },
29
+ "paths": {
30
+ "layout_version": 1,
31
+ "root_local": "runs/2026-03-08_12-52-45"
32
+ },
33
+ "resume": {
34
+ "base_step": 8001,
35
+ "exact": true
36
+ },
37
+ "rng": {
38
+ "keys": [
39
+ {
40
+ "key": "runs/2026-03-08_12-52-45/versions/v008000/rng_rank0000.json",
41
+ "rank": 0
42
+ }
43
+ ],
44
+ "per_rank": true
45
+ },
46
+ "run_id": "2026-03-08_12-52-45",
47
+ "schema_version": 1,
48
+ "step": 8000,
49
+ "version_id": "v008000"
50
+ }
versions/v008000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7930eda42795bcc2cabb42e7ce6a44a4a20a409289cfd18a95daab2bf4ead9b3
3
+ size 42255608
versions/v008000/opt_shard_rank0000.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6a78c31cdcbdf6eb19de87c477c3cd40c50cbd29222e0eaef1014f5895d249b
3
+ size 84561723
versions/v008000/rng_rank0000.json ADDED
The diff for this file is too large to render. See raw diff