trixyL committed on Mar 8

Commit

d51cf33

1 Parent(s): 038f351

dump: training run

Browse files

Files changed (38) hide show

README.md +45 -0
aliases/best.json +12 -0
aliases/latest.json +9 -0
config/config.json +128 -0
config/train.toml +90 -0
manifest.json +102 -0
versions/v001000/manifest.json +50 -0
versions/v001000/model.safetensors +3 -0
versions/v001000/opt_shard_rank0000.bin +3 -0
versions/v001000/rng_rank0000.json +0 -0
versions/v002000/manifest.json +50 -0
versions/v002000/model.safetensors +3 -0
versions/v002000/opt_shard_rank0000.bin +3 -0
versions/v002000/rng_rank0000.json +0 -0
versions/v003000/manifest.json +50 -0
versions/v003000/model.safetensors +3 -0
versions/v003000/opt_shard_rank0000.bin +3 -0
versions/v003000/rng_rank0000.json +0 -0
versions/v004000/manifest.json +50 -0
versions/v004000/model.safetensors +3 -0
versions/v004000/opt_shard_rank0000.bin +3 -0
versions/v004000/rng_rank0000.json +0 -0
versions/v005000/manifest.json +50 -0
versions/v005000/model.safetensors +3 -0
versions/v005000/opt_shard_rank0000.bin +3 -0
versions/v005000/rng_rank0000.json +0 -0
versions/v006000/manifest.json +50 -0
versions/v006000/model.safetensors +3 -0
versions/v006000/opt_shard_rank0000.bin +3 -0
versions/v006000/rng_rank0000.json +0 -0
versions/v007000/manifest.json +50 -0
versions/v007000/model.safetensors +3 -0
versions/v007000/opt_shard_rank0000.bin +3 -0
versions/v007000/rng_rank0000.json +0 -0
versions/v008000/manifest.json +50 -0
versions/v008000/model.safetensors +3 -0
versions/v008000/opt_shard_rank0000.bin +3 -0
versions/v008000/rng_rank0000.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,45 @@

+---
+license: apache-2.0
+datasets:
+- ylecun/mnist
+language:
+- en
+tags:
+- mnist
+- '784'
+- '256'
+- transformerlm
+- flow-matching
+- dit
+---
+# 🧠🌊 TransformerLM (Flow 784, 256) — MNIST
+Training run artifacts from https://github.com/triloy8/transformerlm: a minimal flow-matching **DiT-style** image model trained on **MNIST** with a **fixed 784-token context** (28×28 image values) and **conditional generation** using discrete labels plus a null label for classifier-free guidance (CFG).
+## ✅ Key Facts
+- **Model type:** `image_dit` flow-matching Transformer
+- **Objective:** Flow matching
+- **Dataset:** MNIST (full 8-bit pixel values, 256 levels)
+- **Context length:** 784 values (28×28 image)
+- **Layers:** 8
+- **Heads:** 16
+- **d_model:** 256
+- **d_ff:** 1024
+- **Training setup:** Single NVIDIA A40 (48GB)
+- **Runtime:** ~3 hours ⏱️
+## 📦 What’s Inside
+- Checkpoints saved every 1k steps through 8k steps (full run), each including:
+  - Optimizer state
+  - RNG state
+  - Safetensors weights
+- Run config
+- Best checkpoint alias (`v007000`)
+- Latest checkpoint alias (`v008000`)
+## 🚀 Reproducibility
+This run was dumped from the following exact commit of the training code:
+`https://github.com/triloy8/transformerlm/commit/01459662f08e83abc997966415d648563860859e`

aliases/best.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "alias": "best",
+  "manifest_key": "runs/2026-03-08_12-52-45/versions/v007000/manifest.json",
+  "metric_name": "val_loss",
+  "mode": "min",
+  "run_id": "2026-03-08_12-52-45",
+  "schema_version": 1,
+  "status": "active",
+  "step": 7000,
+  "value": 0.15839263796806335,
+  "version_id": "v007000"
+}

aliases/latest.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "alias": "latest",
+  "manifest_key": "runs/2026-03-08_12-52-45/versions/v008000/manifest.json",
+  "run_id": "2026-03-08_12-52-45",
+  "schema_version": 1,
+  "status": "active",
+  "step": 8000,
+  "version_id": "v008000"
+}

config/config.json ADDED Viewed

	@@ -0,0 +1,128 @@

+{
+  "checkpointing": {
+    "best_metric_name": "val_loss",
+    "best_mode": "min",
+    "ckpting_save_iter": 1000,
+    "enabled": true,
+    "resume_from": null,
+    "resume_optimizer": true,
+    "run_id": null
+  },
+  "compile": null,
+  "data": {
+    "cache_all": true,
+    "dataset_config": null,
+    "dataset_name": "ylecun/mnist",
+    "megatron_train_prefix": null,
+    "megatron_val_prefix": null,
+    "pad_random_shift": false,
+    "pad_token_id": null,
+    "pipeline_mode": "mnist",
+    "runs_path": "runs",
+    "shuffle_buffer_size": 0,
+    "shuffle_seed": 3407,
+    "text_field": "image",
+    "tokenizer": null,
+    "train_split": "train",
+    "val_split": "test"
+  },
+  "ddp": {
+    "backend": "nccl",
+    "bucket_size_mb": 200,
+    "master_addr": "127.0.0.1",
+    "master_port": "29500",
+    "nccl_p2p_disable": true,
+    "node_rank": 0,
+    "num_gpus_per_node": 1,
+    "num_nodes": 1
+  },
+  "logging": {
+    "architecture": "DiTImageFlow",
+    "backend": "wandb",
+    "dataset": "MNIST",
+    "log_activation_norms": false,
+    "log_grad_norms": true,
+    "log_p_mask_bucket_loss": false,
+    "log_weight_norms": true,
+    "p_mask_bucket_edges": null,
+    "run_name": null,
+    "val_log_every": 0,
+    "val_log_samples": 0
+  },
+  "model": {
+    "attention_backend": "torch_sdpa",
+    "attention_sdp_backend": "auto",
+    "context_length": 784,
+    "d_ff": 1024,
+    "d_model": 256,
+    "device": "cuda",
+    "dtype": "float32",
+    "eot_token_id": null,
+    "image_height": 28,
+    "image_width": 28,
+    "label_vocab_size": 11,
+    "mask_token_id": null,
+    "model_type": "image_dit",
+    "noise_epsilon": 0.001,
+    "null_label_id": 10,
+    "num_heads": 16,
+    "num_layers": 8,
+    "pixel_bins": 256,
+    "random_trunc_prob": 0.0,
+    "rope_theta": 10000.0,
+    "use_rope_2d": true,
+    "vocab_size": 257
+  },
+  "optimizer": {
+    "betas": [
+      0.9,
+      0.95
+    ],
+    "cosine_cycle_iters": 60000,
+    "eps": 1e-08,
+    "grad_clip_max_l2_norm": 3.0,
+    "initial_learning_rate": 0.0001,
+    "lr_schedule": "constant_with_warmup",
+    "max_learning_rate": 0.003,
+    "min_learning_rate": 0.0003,
+    "muon": null,
+    "optimizer_name": "adamw",
+    "warmup_iters": 200,
+    "weight_decay": 0.1
+  },
+  "train_infer": null,
+  "training": {
+    "amp_dtype": "bfloat16",
+    "amp_enabled": true,
+    "batch_size": 256,
+    "deterministic_mask": false,
+    "eot_mask_loss": false,
+    "grad_accum_steps": 1,
+    "joint_alpha_schedule": "constant",
+    "joint_alpha_schedule_end": 1.0,
+    "joint_alpha_schedule_start": 0.0,
+    "joint_diffusion_alpha": 0.3,
+    "joint_diffusion_alpha_end": null,
+    "max_train_iteration": 120000,
+    "max_val_iteration": 10,
+    "objective": "flow",
+    "p_mask_end": null,
+    "p_mask_override": null,
+    "p_mask_schedule": "none",
+    "p_mask_schedule_end": 1.0,
+    "p_mask_schedule_start": 0.0,
+    "p_mask_start": null,
+    "repeat_masking_seed": null,
+    "seed": 3407,
+    "skip_validation": false,
+    "train_loss_ema_decay": 0.99,
+    "uncond_label_dropout_prob": 0.1,
+    "val_freq_iteration": 250
+  },
+  "wandb": {
+    "architecture": null,
+    "dataset": null,
+    "entity": "yiltro8-org",
+    "project": "mnist_flow"
+  }
+}

config/train.toml ADDED Viewed

	@@ -0,0 +1,90 @@

+[model]
+model_type = "image_dit"
+label_vocab_size = 11
+vocab_size = 257
+pixel_bins = 256
+context_length = 784
+d_model = 256
+num_layers = 8
+num_heads = 16
+d_ff = 1024
+rope_theta = 10000.0
+attention_backend = "torch_sdpa"
+attention_sdp_backend = "auto"
+device = "cuda"
+dtype = "float32"
+null_label_id = 10
+random_trunc_prob = 0.0
+use_rope_2d = true
+image_height = 28
+image_width = 28
+[optimizer]
+optimizer_name = "adamw"
+betas = [0.9, 0.95]
+eps = 1e-8
+weight_decay = 0.1
+initial_learning_rate = 0.0001
+max_learning_rate = 0.003
+min_learning_rate = 0.0003
+warmup_iters = 200
+cosine_cycle_iters = 60000
+grad_clip_max_l2_norm = 3.0
+lr_schedule = "constant_with_warmup"
+[training]
+batch_size = 256
+max_train_iteration = 120000
+max_val_iteration = 10
+val_freq_iteration = 250
+seed = 3407
+skip_validation = false
+grad_accum_steps = 1
+train_loss_ema_decay = 0.99
+amp_enabled = true
+amp_dtype = "bfloat16"
+objective = "flow"
+uncond_label_dropout_prob = 0.1
+[data]
+runs_path = "./runs"
+dataset_name = "ylecun/mnist"
+train_split = "train"
+val_split = "test"
+text_field = "image"
+pipeline_mode = "mnist"
+shuffle_buffer_size = 0
+cache_all = true
+shuffle_seed = 3407
+[logging]
+backend = "wandb"
+architecture = "DiTImageFlow"
+dataset = "MNIST"
+log_activation_norms = false
+log_weight_norms = true
+log_grad_norms = true
+log_p_mask_bucket_loss = false
+val_log_every = 0
+val_log_samples = 0
+[wandb]
+entity = "yiltro8-org"
+project = "mnist_flow"
+[ddp]
+backend = "nccl"
+num_nodes = 1
+num_gpus_per_node = 1
+node_rank = 0
+master_addr = "127.0.0.1"
+master_port = "29500"
+bucket_size_mb = 200
+nccl_p2p_disable = true
+[checkpointing]
+enabled = true
+ckpting_save_iter = 1000
+resume_optimizer = true
+best_metric_name = "val_loss"
+best_mode = "min"

manifest.json ADDED Viewed

	@@ -0,0 +1,102 @@

+{
+  "aliases": {
+    "best": {
+      "metric_name": "val_loss",
+      "mode": "min",
+      "status": "active",
+      "step": 7000,
+      "value": 0.15839263796806335,
+      "version_id": "v007000"
+    },
+    "latest": {
+      "step": 8000,
+      "version_id": "v008000"
+    }
+  },
+  "config": {
+    "bytes": 1727,
+    "key": "runs/2026-03-08_12-52-45/config/train.toml",
+    "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
+  },
+  "created_at": "2026-03-08T12:52:48.972713Z",
+  "paths": {
+    "layout_version": 1,
+    "root_local": "runs/2026-03-08_12-52-45"
+  },
+  "run_id": "2026-03-08_12-52-45",
+  "schema_version": 1,
+  "versions": [
+    {
+      "created_at": "2026-03-08T13:15:30.591640Z",
+      "metrics": {
+        "val_loss": 0.19012750685214996
+      },
+      "model_key": "runs/2026-03-08_12-52-45/versions/v001000/model.safetensors",
+      "step": 1000,
+      "version_id": "v001000"
+    },
+    {
+      "created_at": "2026-03-08T13:37:49.696556Z",
+      "metrics": {
+        "val_loss": 0.16816174983978271
+      },
+      "model_key": "runs/2026-03-08_12-52-45/versions/v002000/model.safetensors",
+      "step": 2000,
+      "version_id": "v002000"
+    },
+    {
+      "created_at": "2026-03-08T14:00:08.773296Z",
+      "metrics": {
+        "val_loss": 0.16450278460979462
+      },
+      "model_key": "runs/2026-03-08_12-52-45/versions/v003000/model.safetensors",
+      "step": 3000,
+      "version_id": "v003000"
+    },
+    {
+      "created_at": "2026-03-08T14:22:26.642793Z",
+      "metrics": {
+        "val_loss": 0.15943044424057007
+      },
+      "model_key": "runs/2026-03-08_12-52-45/versions/v004000/model.safetensors",
+      "step": 4000,
+      "version_id": "v004000"
+    },
+    {
+      "created_at": "2026-03-08T14:44:45.703082Z",
+      "metrics": {
+        "val_loss": 0.15903371572494507
+      },
+      "model_key": "runs/2026-03-08_12-52-45/versions/v005000/model.safetensors",
+      "step": 5000,
+      "version_id": "v005000"
+    },
+    {
+      "created_at": "2026-03-08T15:07:04.573166Z",
+      "metrics": {
+        "val_loss": 0.16576382517814636
+      },
+      "model_key": "runs/2026-03-08_12-52-45/versions/v006000/model.safetensors",
+      "step": 6000,
+      "version_id": "v006000"
+    },
+    {
+      "created_at": "2026-03-08T15:29:21.920990Z",
+      "metrics": {
+        "val_loss": 0.15839263796806335
+      },
+      "model_key": "runs/2026-03-08_12-52-45/versions/v007000/model.safetensors",
+      "step": 7000,
+      "version_id": "v007000"
+    },
+    {
+      "created_at": "2026-03-08T15:51:37.216665Z",
+      "metrics": {
+        "val_loss": 0.15889549255371094
+      },
+      "model_key": "runs/2026-03-08_12-52-45/versions/v008000/model.safetensors",
+      "step": 8000,
+      "version_id": "v008000"
+    }
+  ]
+}

versions/v001000/manifest.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "amp_scaler": null,
+  "code": {},
+  "config": {
+    "bytes": 1727,
+    "key": "runs/2026-03-08_12-52-45/config/train.toml",
+    "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
+  },
+  "created_at": "2026-03-08T13:15:30.591640Z",
+  "metrics": {
+    "val_loss": 0.19012750685214996
+  },
+  "model": {
+    "bytes": 42255608,
+    "key": "runs/2026-03-08_12-52-45/versions/v001000/model.safetensors",
+    "sha256": "a8fb58f5c19addc5a21b315e02add6f2d7fc2a64e3c054d0d4e4155fe5e88476"
+  },
+  "optimizer": {
+    "sharding": "custom",
+    "shards": [
+      {
+        "bytes": 84561723,
+        "key": "runs/2026-03-08_12-52-45/versions/v001000/opt_shard_rank0000.bin",
+        "rank": 0,
+        "sha256": "acfcefbfb530727eb5e00d94a223027711916ca65510ac25c7ac1c50a8c6e558"
+      }
+    ]
+  },
+  "paths": {
+    "layout_version": 1,
+    "root_local": "runs/2026-03-08_12-52-45"
+  },
+  "resume": {
+    "base_step": 1001,
+    "exact": true
+  },
+  "rng": {
+    "keys": [
+      {
+        "key": "runs/2026-03-08_12-52-45/versions/v001000/rng_rank0000.json",
+        "rank": 0
+      }
+    ],
+    "per_rank": true
+  },
+  "run_id": "2026-03-08_12-52-45",
+  "schema_version": 1,
+  "step": 1000,
+  "version_id": "v001000"
+}

versions/v001000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8fb58f5c19addc5a21b315e02add6f2d7fc2a64e3c054d0d4e4155fe5e88476
+size 42255608

versions/v001000/opt_shard_rank0000.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:acfcefbfb530727eb5e00d94a223027711916ca65510ac25c7ac1c50a8c6e558
+size 84561723

versions/v001000/rng_rank0000.json ADDED Viewed

The diff for this file is too large to render. See raw diff

versions/v002000/manifest.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "amp_scaler": null,
+  "code": {},
+  "config": {
+    "bytes": 1727,
+    "key": "runs/2026-03-08_12-52-45/config/train.toml",
+    "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
+  },
+  "created_at": "2026-03-08T13:37:49.696556Z",
+  "metrics": {
+    "val_loss": 0.16816174983978271
+  },
+  "model": {
+    "bytes": 42255608,
+    "key": "runs/2026-03-08_12-52-45/versions/v002000/model.safetensors",
+    "sha256": "eb1b891ccd36e9f026e49e5fcdd7c18e3bb7bc1dadba8de9bef0cbafed54e752"
+  },
+  "optimizer": {
+    "sharding": "custom",
+    "shards": [
+      {
+        "bytes": 84561723,
+        "key": "runs/2026-03-08_12-52-45/versions/v002000/opt_shard_rank0000.bin",
+        "rank": 0,
+        "sha256": "3ed59bc8dc7eeb8fb4884027d41e109cf11994f2c6180d87fb786b2494cbfd02"
+      }
+    ]
+  },
+  "paths": {
+    "layout_version": 1,
+    "root_local": "runs/2026-03-08_12-52-45"
+  },
+  "resume": {
+    "base_step": 2001,
+    "exact": true
+  },
+  "rng": {
+    "keys": [
+      {
+        "key": "runs/2026-03-08_12-52-45/versions/v002000/rng_rank0000.json",
+        "rank": 0
+      }
+    ],
+    "per_rank": true
+  },
+  "run_id": "2026-03-08_12-52-45",
+  "schema_version": 1,
+  "step": 2000,
+  "version_id": "v002000"
+}

versions/v002000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eb1b891ccd36e9f026e49e5fcdd7c18e3bb7bc1dadba8de9bef0cbafed54e752
+size 42255608

versions/v002000/opt_shard_rank0000.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ed59bc8dc7eeb8fb4884027d41e109cf11994f2c6180d87fb786b2494cbfd02
+size 84561723

versions/v002000/rng_rank0000.json ADDED Viewed

The diff for this file is too large to render. See raw diff

versions/v003000/manifest.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "amp_scaler": null,
+  "code": {},
+  "config": {
+    "bytes": 1727,
+    "key": "runs/2026-03-08_12-52-45/config/train.toml",
+    "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
+  },
+  "created_at": "2026-03-08T14:00:08.773296Z",
+  "metrics": {
+    "val_loss": 0.16450278460979462
+  },
+  "model": {
+    "bytes": 42255608,
+    "key": "runs/2026-03-08_12-52-45/versions/v003000/model.safetensors",
+    "sha256": "02a48a588479301bdf67189ebd4a71501a12a8362a668deeb36392a5498c5757"
+  },
+  "optimizer": {
+    "sharding": "custom",
+    "shards": [
+      {
+        "bytes": 84561723,
+        "key": "runs/2026-03-08_12-52-45/versions/v003000/opt_shard_rank0000.bin",
+        "rank": 0,
+        "sha256": "1697a8a22f8da31b290bf28ca102d83db676ff8cd209f35821eb3592952b18c9"
+      }
+    ]
+  },
+  "paths": {
+    "layout_version": 1,
+    "root_local": "runs/2026-03-08_12-52-45"
+  },
+  "resume": {
+    "base_step": 3001,
+    "exact": true
+  },
+  "rng": {
+    "keys": [
+      {
+        "key": "runs/2026-03-08_12-52-45/versions/v003000/rng_rank0000.json",
+        "rank": 0
+      }
+    ],
+    "per_rank": true
+  },
+  "run_id": "2026-03-08_12-52-45",
+  "schema_version": 1,
+  "step": 3000,
+  "version_id": "v003000"
+}

versions/v003000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02a48a588479301bdf67189ebd4a71501a12a8362a668deeb36392a5498c5757
+size 42255608

versions/v003000/opt_shard_rank0000.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1697a8a22f8da31b290bf28ca102d83db676ff8cd209f35821eb3592952b18c9
+size 84561723

versions/v003000/rng_rank0000.json ADDED Viewed

The diff for this file is too large to render. See raw diff

versions/v004000/manifest.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "amp_scaler": null,
+  "code": {},
+  "config": {
+    "bytes": 1727,
+    "key": "runs/2026-03-08_12-52-45/config/train.toml",
+    "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
+  },
+  "created_at": "2026-03-08T14:22:26.642793Z",
+  "metrics": {
+    "val_loss": 0.15943044424057007
+  },
+  "model": {
+    "bytes": 42255608,
+    "key": "runs/2026-03-08_12-52-45/versions/v004000/model.safetensors",
+    "sha256": "20fdc6efe3886dab442a9d6ce8ff340fca5450c06db0ebc2a0fec4c72527c299"
+  },
+  "optimizer": {
+    "sharding": "custom",
+    "shards": [
+      {
+        "bytes": 84561723,
+        "key": "runs/2026-03-08_12-52-45/versions/v004000/opt_shard_rank0000.bin",
+        "rank": 0,
+        "sha256": "b2ea7c4229e7f15fc8df06cc4aa4826568a902a19222fdae96d6809cccbb452c"
+      }
+    ]
+  },
+  "paths": {
+    "layout_version": 1,
+    "root_local": "runs/2026-03-08_12-52-45"
+  },
+  "resume": {
+    "base_step": 4001,
+    "exact": true
+  },
+  "rng": {
+    "keys": [
+      {
+        "key": "runs/2026-03-08_12-52-45/versions/v004000/rng_rank0000.json",
+        "rank": 0
+      }
+    ],
+    "per_rank": true
+  },
+  "run_id": "2026-03-08_12-52-45",
+  "schema_version": 1,
+  "step": 4000,
+  "version_id": "v004000"
+}

versions/v004000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:20fdc6efe3886dab442a9d6ce8ff340fca5450c06db0ebc2a0fec4c72527c299
+size 42255608

versions/v004000/opt_shard_rank0000.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2ea7c4229e7f15fc8df06cc4aa4826568a902a19222fdae96d6809cccbb452c
+size 84561723

versions/v004000/rng_rank0000.json ADDED Viewed

The diff for this file is too large to render. See raw diff

versions/v005000/manifest.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "amp_scaler": null,
+  "code": {},
+  "config": {
+    "bytes": 1727,
+    "key": "runs/2026-03-08_12-52-45/config/train.toml",
+    "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
+  },
+  "created_at": "2026-03-08T14:44:45.703082Z",
+  "metrics": {
+    "val_loss": 0.15903371572494507
+  },
+  "model": {
+    "bytes": 42255608,
+    "key": "runs/2026-03-08_12-52-45/versions/v005000/model.safetensors",
+    "sha256": "28c6c1419029bde3d80261904b955f8f4b584c6d7553976020ed830dd524c4b0"
+  },
+  "optimizer": {
+    "sharding": "custom",
+    "shards": [
+      {
+        "bytes": 84561723,
+        "key": "runs/2026-03-08_12-52-45/versions/v005000/opt_shard_rank0000.bin",
+        "rank": 0,
+        "sha256": "2fd9c186811cb3c2e02687a8836b8c7210a7e63b2942cbb545c2206dff03d410"
+      }
+    ]
+  },
+  "paths": {
+    "layout_version": 1,
+    "root_local": "runs/2026-03-08_12-52-45"
+  },
+  "resume": {
+    "base_step": 5001,
+    "exact": true
+  },
+  "rng": {
+    "keys": [
+      {
+        "key": "runs/2026-03-08_12-52-45/versions/v005000/rng_rank0000.json",
+        "rank": 0
+      }
+    ],
+    "per_rank": true
+  },
+  "run_id": "2026-03-08_12-52-45",
+  "schema_version": 1,
+  "step": 5000,
+  "version_id": "v005000"
+}

versions/v005000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:28c6c1419029bde3d80261904b955f8f4b584c6d7553976020ed830dd524c4b0
+size 42255608

versions/v005000/opt_shard_rank0000.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2fd9c186811cb3c2e02687a8836b8c7210a7e63b2942cbb545c2206dff03d410
+size 84561723

versions/v005000/rng_rank0000.json ADDED Viewed

The diff for this file is too large to render. See raw diff

versions/v006000/manifest.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "amp_scaler": null,
+  "code": {},
+  "config": {
+    "bytes": 1727,
+    "key": "runs/2026-03-08_12-52-45/config/train.toml",
+    "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
+  },
+  "created_at": "2026-03-08T15:07:04.573166Z",
+  "metrics": {
+    "val_loss": 0.16576382517814636
+  },
+  "model": {
+    "bytes": 42255608,
+    "key": "runs/2026-03-08_12-52-45/versions/v006000/model.safetensors",
+    "sha256": "4a45329c6cf33a7859a7b5176abe1e9ddd93b2808bcb8ef904dfa8d9da2a0628"
+  },
+  "optimizer": {
+    "sharding": "custom",
+    "shards": [
+      {
+        "bytes": 84561723,
+        "key": "runs/2026-03-08_12-52-45/versions/v006000/opt_shard_rank0000.bin",
+        "rank": 0,
+        "sha256": "230764e6ed078a33d9020095686d1ee471abce777071c68b30c18f800eecfe29"
+      }
+    ]
+  },
+  "paths": {
+    "layout_version": 1,
+    "root_local": "runs/2026-03-08_12-52-45"
+  },
+  "resume": {
+    "base_step": 6001,
+    "exact": true
+  },
+  "rng": {
+    "keys": [
+      {
+        "key": "runs/2026-03-08_12-52-45/versions/v006000/rng_rank0000.json",
+        "rank": 0
+      }
+    ],
+    "per_rank": true
+  },
+  "run_id": "2026-03-08_12-52-45",
+  "schema_version": 1,
+  "step": 6000,
+  "version_id": "v006000"
+}

versions/v006000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4a45329c6cf33a7859a7b5176abe1e9ddd93b2808bcb8ef904dfa8d9da2a0628
+size 42255608

versions/v006000/opt_shard_rank0000.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:230764e6ed078a33d9020095686d1ee471abce777071c68b30c18f800eecfe29
+size 84561723

versions/v006000/rng_rank0000.json ADDED Viewed

The diff for this file is too large to render. See raw diff

versions/v007000/manifest.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "amp_scaler": null,
+  "code": {},
+  "config": {
+    "bytes": 1727,
+    "key": "runs/2026-03-08_12-52-45/config/train.toml",
+    "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
+  },
+  "created_at": "2026-03-08T15:29:21.920990Z",
+  "metrics": {
+    "val_loss": 0.15839263796806335
+  },
+  "model": {
+    "bytes": 42255608,
+    "key": "runs/2026-03-08_12-52-45/versions/v007000/model.safetensors",
+    "sha256": "a2aab911a7a6732e696b6dcb65909f1dbfacc4575157bb083d9700d2be3cfce1"
+  },
+  "optimizer": {
+    "sharding": "custom",
+    "shards": [
+      {
+        "bytes": 84561723,
+        "key": "runs/2026-03-08_12-52-45/versions/v007000/opt_shard_rank0000.bin",
+        "rank": 0,
+        "sha256": "7832cd4f8a3e30dba3ab050f90b1deb18334fa75bd53d90412f1447ad36898e8"
+      }
+    ]
+  },
+  "paths": {
+    "layout_version": 1,
+    "root_local": "runs/2026-03-08_12-52-45"
+  },
+  "resume": {
+    "base_step": 7001,
+    "exact": true
+  },
+  "rng": {
+    "keys": [
+      {
+        "key": "runs/2026-03-08_12-52-45/versions/v007000/rng_rank0000.json",
+        "rank": 0
+      }
+    ],
+    "per_rank": true
+  },
+  "run_id": "2026-03-08_12-52-45",
+  "schema_version": 1,
+  "step": 7000,
+  "version_id": "v007000"
+}

versions/v007000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2aab911a7a6732e696b6dcb65909f1dbfacc4575157bb083d9700d2be3cfce1
+size 42255608

versions/v007000/opt_shard_rank0000.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7832cd4f8a3e30dba3ab050f90b1deb18334fa75bd53d90412f1447ad36898e8
+size 84561723

versions/v007000/rng_rank0000.json ADDED Viewed

The diff for this file is too large to render. See raw diff

versions/v008000/manifest.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "amp_scaler": null,
+  "code": {},
+  "config": {
+    "bytes": 1727,
+    "key": "runs/2026-03-08_12-52-45/config/train.toml",
+    "sha256": "f10153c45a6c87d4494ccb805acc6d4cb46d2ec93fa8c647d4de379f16acfd08"
+  },
+  "created_at": "2026-03-08T15:51:37.216665Z",
+  "metrics": {
+    "val_loss": 0.15889549255371094
+  },
+  "model": {
+    "bytes": 42255608,
+    "key": "runs/2026-03-08_12-52-45/versions/v008000/model.safetensors",
+    "sha256": "7930eda42795bcc2cabb42e7ce6a44a4a20a409289cfd18a95daab2bf4ead9b3"
+  },
+  "optimizer": {
+    "sharding": "custom",
+    "shards": [
+      {
+        "bytes": 84561723,
+        "key": "runs/2026-03-08_12-52-45/versions/v008000/opt_shard_rank0000.bin",
+        "rank": 0,
+        "sha256": "c6a78c31cdcbdf6eb19de87c477c3cd40c50cbd29222e0eaef1014f5895d249b"
+      }
+    ]
+  },
+  "paths": {
+    "layout_version": 1,
+    "root_local": "runs/2026-03-08_12-52-45"
+  },
+  "resume": {
+    "base_step": 8001,
+    "exact": true
+  },
+  "rng": {
+    "keys": [
+      {
+        "key": "runs/2026-03-08_12-52-45/versions/v008000/rng_rank0000.json",
+        "rank": 0
+      }
+    ],
+    "per_rank": true
+  },
+  "run_id": "2026-03-08_12-52-45",
+  "schema_version": 1,
+  "step": 8000,
+  "version_id": "v008000"
+}

versions/v008000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7930eda42795bcc2cabb42e7ce6a44a4a20a409289cfd18a95daab2bf4ead9b3
+size 42255608

versions/v008000/opt_shard_rank0000.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c6a78c31cdcbdf6eb19de87c477c3cd40c50cbd29222e0eaef1014f5895d249b
+size 84561723

versions/v008000/rng_rank0000.json ADDED Viewed

The diff for this file is too large to render. See raw diff