Skip to content

Meridian.AI Train #2060

Meridian.AI Train

Meridian.AI Train #2060

Workflow file for this run

name: Meridian.AI Train

# Triggers: an hourly cron plus a manual dispatch with two optional knobs.
on:
  schedule:
    # Minute 0 of every hour, every day.
    - cron: '0 * * * *'
  workflow_dispatch:
    inputs:
      # When true, the `seed` job runs before training.
      force_seed:
        description: 'Nuke & re-seed HF repo with fresh model?'
        type: boolean
        default: false
      # Forwarded to the Train step as MAX_STEPS.
      max_steps:
        description: 'Training steps per run (default: 150)'
        type: string
        default: '150'

# Workflow-wide environment. Quoted because environment values are strings.
env:
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: 'true'

# Runs sharing a ref share a concurrency group; an in-flight run is never
# cancelled by a newer one.
concurrency:
  group: meridian-train-${{ github.ref }}
  cancel-in-progress: false

jobs:
# Optional job: only runs for a manual dispatch with force_seed ticked.
# On scheduled runs the input is absent, the condition is false and the job
# is reported as skipped.
seed:
  name: 'Nuke & Seed HF'
  runs-on: ubuntu-latest
  environment: HuggingFace Hub
  if: ${{ github.event.inputs.force_seed == 'true' }}
  steps:
    - uses: actions/checkout@v4
      with:
        token: ${{ secrets.GH_PAT }}

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'

    # Folded scalar: the two lines below are joined into one pip command.
    - name: Install deps
      run: >-
        pip install huggingface_hub transformers torch
        safetensors sentencepiece python-dotenv

    # Delegates the actual re-seeding to the repository's seed script.
    - name: Nuke & Seed
      run: python scripts/seed_hf_repo.py
      env:
        PYTHONPATH: '.'
        TOKENIZER_ID: 'Qwen/Qwen2.5-0.5B'
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
# Main job. It waits for `seed`, but still runs when `seed` was skipped
# (the normal scheduled case); it is held back only when seeding failed or
# was cancelled.
train:
  name: 'Hourly Training Run'
  needs: [seed]
  if: >-
    always() &&
    (needs.seed.result == 'success' || needs.seed.result == 'skipped')
  runs-on: ubuntu-latest
  # 1.5 hours max for the whole job.
  timeout-minutes: 90
  permissions:
    issues: write
    contents: write
  env:
    # Hugging Face cache lives inside the workspace so the cache step can
    # persist it between runs: the Qwen base model + tokenizer are then
    # downloaded once, and transformers can fall back to this copy when the
    # Hub answers 429.
    HF_HOME: ${{ github.workspace }}/.hf_cache
    HF_HUB_DOWNLOAD_TIMEOUT: '60'
  steps:
# Full-history checkout, authenticated with the PAT.
- uses: actions/checkout@v4
  with:
    token: ${{ secrets.GH_PAT }}
    fetch-depth: 0

# Interpreter plus setup-python's built-in pip download cache.
- name: Set up Python 3.11
  uses: actions/setup-python@v5
  with:
    cache: pip
    python-version: '3.11'

# Restores/saves the directory that HF_HOME points at. The primary key is
# fixed; any older `hf-cache-` entry is accepted as a fallback.
- name: Cache HuggingFace base model
  uses: actions/cache@v4
  with:
    key: hf-cache-qwen2.5-0.5b-v1
    restore-keys: hf-cache-
    path: ${{ github.workspace }}/.hf_cache
# Project requirements first, then the two formatting/lint tools used below.
- name: Install Dependencies
  run: |
    python -m pip install --upgrade pip
    pip install -r requirements.txt
    pip install ruff black

# Rewrites files in place; the resulting changes are committed later by the
# "Sync Dataset State" step. The default shell stops on the first failing
# command, so a non-zero exit from either tool fails this step.
- name: Lint & Format
  run: |
    black --quiet .
    ruff check --fix --quiet .
# Downloads the rolling checkpoint from the Hub into ./checkpoint.
# Fails the job (instead of silently training from scratch) when the pull
# cannot complete or when no model weights come back.
- name: Pull Checkpoint from HuggingFace
  env:
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
    # Be patient and gentle with the Hub: longer per-request timeout, and the
    # download itself uses few concurrent workers (high concurrency is what
    # trips HF's 429 rate limiter on shared GitHub Actions IPs).
    HF_HUB_DOWNLOAD_TIMEOUT: '60'
    # Without this, Python buffers stdout and Actions flushes it all at the
    # end, making every retry line share one timestamp (looks like no backoff
    # happened).
    PYTHONUNBUFFERED: '1'
    # Escape hatch read by the script below. It was previously never passed
    # to the step, so the "start fresh" branch could not be reached without
    # editing this file. Set the repository variable ALLOW_FRESH_START to 1
    # to enable it; when unset this expands to an empty string and the
    # behaviour is unchanged.
    ALLOW_FRESH_START: ${{ vars.ALLOW_FRESH_START }}
  # The script is fed through a quoted heredoc so the shell never interprets
  # `$`, backticks or double quotes inside the Python source.
  run: |
    python - <<'PY'
    from huggingface_hub import snapshot_download
    import os, shutil, time

    repo_id = 'meridianal/FinAI'
    token = os.getenv('HF_TOKEN')

    # Only fetch the checkpoint/ folder (skips README, datasets, etc.) — fewer
    # files means fewer metadata HEAD calls, the requests HF is 429-ing. We
    # download directly and do NOT call list_repo_files first: that extra tree
    # API call is itself 429-prone, and a 429 there used to drop us into
    # 'start fresh', which makes train.py download the Qwen base (429-fails)
    # and risks overwriting the good Hub checkpoint with a from-scratch model.
    ALLOW = ['checkpoint/**']
    IGNORE = ['checkpoint/pytorch_model.bin', '**/pytorch_model.bin']

    # The pull is the single point of failure for the whole run (we abort
    # rather than train from scratch), and HF/CloudFront blips can last
    # several minutes. 8 attempts with capped exponential backoff =
    # 10+20+40+80+120+120+120s of sleeping (510s) before giving up on a
    # transient outage.
    temp_dir = './temp_download'
    delay = 10
    attempts = 8
    last_err = None
    for attempt in range(1, attempts + 1):
        try:
            snapshot_download(
                repo_id=repo_id,
                local_dir=temp_dir,
                token=token,
                allow_patterns=ALLOW,
                ignore_patterns=IGNORE,
                max_workers=2,
            )
            last_err = None
            break
        except Exception as _e:
            last_err = _e
            print(f' Checkpoint pull attempt {attempt}/{attempts} failed: {_e}')
            if attempt < attempts:
                print(f' Retrying in {delay}s...')
                time.sleep(delay)
                delay = min(delay * 2, 120)

    # A download exception after all retries is FATAL: the repo reliably holds
    # the rolling checkpoint, so the only safe action is to abort rather than
    # train from scratch and clobber it. Failing the job is better than a
    # silent regression.
    if last_err is not None:
        raise SystemExit(f'FATAL: checkpoint pull failed after retries: {last_err}')

    # Swap the freshly downloaded folder into place, replacing any existing one.
    if os.path.exists(os.path.join(temp_dir, 'checkpoint')):
        if os.path.exists('./checkpoint'):
            shutil.rmtree('./checkpoint')
        shutil.move(os.path.join(temp_dir, 'checkpoint'), './checkpoint')
        print('Checkpoint pulled')
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)

    # Log which of the expected checkpoint files arrived, with sizes.
    for cf in ['model.safetensors', 'trainer_state.pt', 'config.json', 'ewc_state.pt']:
        path = os.path.join('./checkpoint', cf)
        if os.path.exists(path):
            size_mb = os.path.getsize(path) / (1024 * 1024)
            print(f' - {cf} ({size_mb:.2f} MB)')
        else:
            print(f' - {cf} NOT FOUND')

    # snapshot_download succeeded but no weights -> the repo genuinely has no
    # checkpoint yet (only legitimate before the first seed). Otherwise abort:
    # a missing model.safetensors here would force a Hub base download (429)
    # and a from-scratch upload over the good checkpoint.
    if not os.path.exists('./checkpoint/model.safetensors'):
        if os.getenv('ALLOW_FRESH_START') == '1':
            print('No checkpoint in repo and ALLOW_FRESH_START=1 — starting fresh.')
        else:
            raise SystemExit(
                'FATAL: no checkpoint/model.safetensors after a successful pull. '
                'Set ALLOW_FRESH_START=1 to intentionally start from the base model.'
            )
    PY
# Runs train.py under a wall-clock limit, mirrors its output to
# train_output.log, and publishes four step outputs consumed by later steps:
# train_exit_code, error_count, fatal_count and train_failed.
# The step itself always succeeds so the checkpoint steps still run.
- name: Train
  id: training
  env:
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
    COMET_API_KEY: ${{ secrets.COMET_API_KEY }}
    TOKENIZER_ID: 'Qwen/Qwen2.5-0.5B'
    DTYPE: 'bfloat16'
    SKIP_OPTIMIZER_SAVE: '1'
    HARD_RAM_GUARD: '1'
    # The real OOM cause was the data pipeline, not the model: 25 streaming
    # datasets each with a 2000-item shuffle buffer grew RSS past 15GB. With
    # SHUFFLE_BUFFER small (below), steady usage is ~10-11GB, so these caps
    # give room to actually train and only intervene near a genuine OOM.
    # (model step peak ~5.75GB RSS, pipeline ~5GB — both measured.)
    MAX_RAM_GB: '14.5'
    SOFT_RAM_GB: '12.5'
    SOFT_RAM_PCT: '80'
    MIN_THROTTLE_SEQ_LEN: '64'
    # Streaming-shuffle look-ahead per dataset; small because ~25 run
    # concurrently.
    SHUFFLE_BUFFER: '128'
    DEBUG_STEPS: '0'
    # Manual dispatch may override the step budget; scheduled runs use 150.
    MAX_STEPS: ${{ github.event.inputs.max_steps || '150' }}
    TOTAL_STEPS: '100000'
    # GRAD_ACCUM=1: on the slow CPU runner a single checkpointed backward
    # could take minutes, so accumulating 4 micro-steps never committed an
    # optimizer step before the RAM guard tripped -> "0 items trained" every
    # run. With accum=1 every micro-step IS an optimizer step, so progress
    # (global_step, checkpoint, loss curve) advances and is saved even if the
    # run is cut short.
    GRAD_ACCUM: '1'
    BATCH_SIZE: '1'
    LEARNING_RATE: '5e-5'
    # Reduced 384 -> 256: activation memory scales with sequence length and
    # is the dominant term in the backward-pass peak on the CPU runner. 256
    # still gives the model meaningful context while keeping peak RAM under
    # the MAX_RAM_GB hard cap set above.
    BLOCK_SIZE: '256'
    MAX_BYTES: '26214400'
    USE_EWC: '1'
    EWC_LAMBDA: '75.0'
    EWC_SAMPLES: '20'
    # OFF: at batch=1/block=256 checkpointing saves only ~300MB of
    # activations but ~doubles each (already slow) CPU backward by
    # recomputing the forward. That recompute was the reason a step never
    # finished in time. Memory is bounded by the guards + small shuffle
    # buffer instead.
    GRADIENT_CHECKPOINTING: '0'
    OPTIMIZER: 'adafactor'
    FREE_OPTIMIZER_BEFORE_FISHER: '1'
    FISHER_SEQ_LEN: '64'
    FISHER_THRESHOLD: '5e-4'
    FISHER_TOP_K_RATIO: '0.08'
    USE_LIGHT_DATASETS: '0'
    # Stream at most 1 heavy dataset (fineweb-edu / OpenMathInstruct-2) at a
    # time; their multi-GB Parquet row-groups were filling the 16GB runner.
    # Which heavy dataset is active rotates per run, so both are still
    # trained on.
    MAX_HEAVY_CONCURRENT: '1'
    # fineweb-edu's huge Parquet row-groups blow the 16GB runner even alone
    # (runs that rotated to it trained only 2-3 steps before the guard fired,
    # vs ~50 for OpenMathInstruct-2). Drop it here; re-enable on a bigger
    # runner.
    EXCLUDE_DATASETS: 'HuggingFaceFW/fineweb-edu'
    GC_EVERY_STEPS: '2'
    # Collect freed Arrow row-groups every few micro-steps so streaming
    # memory can't creep to the guard before the first optimizer step runs
    # any gc.
    GC_EVERY_MICROSTEPS: '4'
    PYTHONPATH: .
    PYTHONUNBUFFERED: '1'
  run: |
    # 4800s = 80 min, inside the job's 90-minute timeout. PIPESTATUS must be
    # read immediately: it holds the status of `timeout`/python, not `tee`.
    timeout 4800 python train.py 2>&1 | tee train_output.log
    TRAIN_EXIT=${PIPESTATUS[0]}
    echo "train_exit_code=$TRAIN_EXIT" >> "$GITHUB_OUTPUT"

    # Total number of [ERROR] log lines (grep -c exits 1 on zero matches,
    # hence the fallback assignment).
    ERROR_COUNT=$(grep -c '\[ERROR\]' train_output.log 2>/dev/null) || ERROR_COUNT=0
    echo "error_count=$ERROR_COUNT" >> "$GITHUB_OUTPUT"

    # Extract up to 10 unique error messages for the issue body
    grep '\[ERROR\]' train_output.log | sort -u | head -10 > train_errors.txt 2>/dev/null || true

    # Detect fatal patterns (NaN loss explosion, OOM, etc.)
    FATAL=$(grep -cE 'CUDA out of memory|Loss is NaN|RuntimeError|FATAL' train_output.log 2>/dev/null) || FATAL=0
    echo "fatal_count=$FATAL" >> "$GITHUB_OUTPUT"

    # An exit status above 128 means the process died from a signal (e.g.
    # 137 = SIGKILL, which is what the kernel OOM killer sends). Such a death
    # can leave no [ERROR]/FATAL line in the log, so it is checked
    # explicitly. A plain timeout (status 124) is deliberately not treated
    # as a failure.
    CRASHED=0
    if [ "$TRAIN_EXIT" -gt 128 ]; then
      CRASHED=1
      echo "[FAIL] train.py exited with code $TRAIN_EXIT (killed by a signal)."
    fi

    # Systemic failure = more than 50 error lines, any fatal pattern, or a
    # signal death.
    if [ "$ERROR_COUNT" -gt 50 ] || [ "$FATAL" -gt 0 ] || [ "$CRASHED" -eq 1 ]; then
      echo "train_failed=true" >> "$GITHUB_OUTPUT"
      echo "[FAIL] Training had $ERROR_COUNT errors, $FATAL fatal issues."
    else
      echo "train_failed=false" >> "$GITHUB_OUTPUT"
    fi

    # Still save checkpoint even on errors
    exit 0
# Files (or appends to) a GitHub issue when the Train step flagged a failure.
# If an open issue labelled `training-failure` already exists, the report is
# added to it as a comment instead of opening a duplicate.
- name: Auto-create issue on training failure
  if: steps.training.outputs.train_failed == 'true'
  env:
    GH_TOKEN: ${{ secrets.GH_PAT }}
    # Step outputs and context values are handed over as environment
    # variables rather than being expanded into the script text, so their
    # content can never be parsed as shell syntax.
    ERROR_COUNT: ${{ steps.training.outputs.error_count }}
    FATAL_COUNT: ${{ steps.training.outputs.fatal_count }}
    EXIT_CODE: ${{ steps.training.outputs.train_exit_code }}
    RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  run: |
    # Read unique errors. The file is empty when the failure was flagged by
    # a fatal pattern without any [ERROR] line, so test for content
    # explicitly: in a `cat | head || echo` pipeline the fallback never runs,
    # because the pipeline's status is that of `head`.
    if [ -s train_errors.txt ]; then
      ERRORS=$(head -10 train_errors.txt)
    else
      ERRORS="No error details captured"
    fi

    # Check for existing open issue to avoid duplicates
    EXISTING=$(gh issue list --state open --label "training-failure" --limit 1 --json number -q '.[0].number' 2>/dev/null || echo "")

    BODY="$(cat <<EOF
    ## Training Failure Report
    **Run:** ${RUN_URL}
    **Date:** $(date -u +%Y-%m-%dT%H:%M:%SZ)
    **Exit Code:** ${EXIT_CODE}
    **Error Count:** ${ERROR_COUNT}
    **Fatal Count:** ${FATAL_COUNT}
    ### Unique Errors
    \`\`\`
    ${ERRORS}
    \`\`\`
    ### Action Required
    - [ ] Investigate root cause
    - [ ] Check if EWC state file is stale (shape mismatch)
    - [ ] Check model architecture hasn't changed
    - [ ] Re-run training after fix
    *Auto-generated by CI*
    EOF
    )"

    if [ -n "$EXISTING" ]; then
      echo "Appending to existing issue #$EXISTING"
      gh issue comment "$EXISTING" --body "$BODY"
    else
      gh issue create \
        --title "Training failure: ${ERROR_COUNT} errors detected ($(date -u +%Y-%m-%d))" \
        --body "$BODY" \
        --label "training-failure,bug"
    fi
# Pushes ./checkpoint back to the Hub and refreshes the model card.
# Runs whatever the outcome of the earlier steps. A failed checkpoint upload
# fails the step; a failed model-card upload is only logged.
- name: Upload Checkpoint to HuggingFace
  if: always()
  env:
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
  run: |
    python - <<'PY'
    import os
    import sys

    from huggingface_hub import HfApi

    # Nothing to publish if no checkpoint directory exists on disk.
    if not os.path.exists('./checkpoint'):
        print('No checkpoint found, skipping')
        sys.exit(0)

    repo_id = 'meridianal/FinAI'
    token = os.getenv('HF_TOKEN')
    api = HfApi()

    print(f'Uploading checkpoint to {repo_id}...')
    try:
        # delete_patterns removes any legacy pytorch_model.bin from the repo
        # as part of the same commit.
        api.upload_folder(
            folder_path='./checkpoint',
            repo_id=repo_id,
            path_in_repo='checkpoint',
            commit_message='Hourly training update [skip ci]',
            token=token,
            delete_patterns=['pytorch_model.bin', '**/pytorch_model.bin'],
        )
    except Exception as e:
        print(f'Checkpoint upload failed: {e}')
        sys.exit(1)
    else:
        print('Checkpoint upload successful')

    # Upload model card to HF repo root (MODEL_CARD.md carries the HF YAML
    # frontmatter; the repo README.md is the GitHub landing page and
    # intentionally has no frontmatter)
    model_card_path = './MODEL_CARD.md'
    if not os.path.exists(model_card_path):
        print('No MODEL_CARD.md found, skipping model card upload')
    else:
        print('Uploading model card...')
        try:
            api.upload_file(
                path_or_fileobj=model_card_path,
                path_in_repo='README.md',
                repo_id=repo_id,
                commit_message='Update model card [skip ci]',
                token=token,
            )
        except Exception as e:
            print(f'Model card upload failed (non-fatal): {e}')
        else:
            print('Model card upload successful')
    PY
# Loads the local checkpoint and greedily generates a short answer for two
# fixed prompts, printing a per-prompt [OK]/[WARN] verdict. Purely
# informational: every failure path is caught and logged, so the script
# never exits non-zero on its own.
- name: Generation Smoke Test
  if: always()
  run: |
    python - <<'PY'
    import os, sys, time, torch

    ckpt = './checkpoint'
    weights_file = os.path.join(ckpt, 'model.safetensors')
    if not os.path.exists(weights_file):
        print('[SKIP] No checkpoint found — skipping smoke test')
        sys.exit(0)

    try:
        from transformers import AutoModelForCausalLM, AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained(ckpt)
        model = AutoModelForCausalLM.from_pretrained(
            ckpt, torch_dtype=torch.float32, low_cpu_mem_usage=True
        )
        model.eval()

        prompts = [
            '### Instruction:\nWhat is the Black-Scholes model used for?\n\n### Response:\n',
            '### Instruction:\nExplain compound interest briefly.\n\n### Response:\n',
        ]

        print()
        all_ok = True
        for i, prompt in enumerate(prompts, 1):
            inputs = tokenizer(prompt, return_tensors='pt')
            prompt_len = inputs['input_ids'].shape[1]

            t0 = time.time()
            with torch.no_grad():
                out = model.generate(
                    **inputs,
                    max_new_tokens=80,
                    do_sample=False,
                    repetition_penalty=1.1,
                    pad_token_id=tokenizer.eos_token_id,
                )
            elapsed = time.time() - t0

            # Everything past the prompt is newly generated text.
            new_tokens = out.shape[1] - prompt_len
            response = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

            # Quality checks: at least 20 new tokens, and at least 40% of the
            # whitespace-separated words distinct.
            words = response.split()
            unique_ratio = len(set(words)) / max(len(words), 1)
            ok = new_tokens >= 20 and unique_ratio >= 0.4
            status = '[OK]' if ok else '[WARN]'

            print(f'Prompt {i}: {status} {new_tokens} new tokens, {unique_ratio:.2f} unique ratio, {elapsed:.1f}s')
            print(f' Response: {response[:120].strip()!r}')
            if not ok:
                all_ok = False

        # NOTE(review): the original indentation of this bare print() was not
        # recoverable; it is assumed to sit after the loop — confirm.
        print()
        print('[OK] Smoke test complete' if all_ok else '[WARN] Some prompts produced low-quality output')
    except Exception as e:
        print(f'[WARN] Smoke test failed (non-fatal): {e}')
    PY
# Commits whatever changed in the working tree during the run (formatter
# rewrites, dataset state files) and pushes it to main.
# Runs whatever the outcome of the earlier steps.
- name: Sync Dataset State
  if: always()
  env:
    GH_PAT: ${{ secrets.GH_PAT }}
  run: |
    git config --local user.name "Meridian.AI Bot"
    git config --local user.email "action@github.com"

    # NOTE(review): `git add .` stages every non-ignored path in the
    # workspace. This presumably relies on .gitignore excluding ./checkpoint,
    # ./.hf_cache and the training logs — confirm.
    git add .

    # Commit only when something is actually staged.
    if ! git diff --staged --quiet; then
      git commit -m "chore: sync dataset state & formatting [skip ci]"
    fi

    # Replay the local commit on top of the remote; the `-X theirs` strategy
    # option is applied to conflicts. If the rebase cannot complete, abandon
    # it and fall back to a merge pull.
    if ! git pull --rebase -X theirs origin main; then
      git rebase --abort
      git pull --no-rebase origin main
    fi

    git push "https://${GH_PAT}@github.com/MeridianAlgo/FinAI.git" main