Meridian.AI Train #2060

Workflow file for this run

	# Workflow header: triggers, workflow-wide env, and concurrency policy.
	name: Meridian.AI Train

	on:
	  schedule:
	    # Every hour, every day
	    - cron: '0 * * * *'
	  # Manual trigger with two optional inputs.
	  workflow_dispatch:
	    inputs:
	      force_seed:
	        description: 'Nuke & re-seed HF repo with fresh model?'
	        type: boolean
	        default: false
	      max_steps:
	        description: 'Training steps per run (default: 150)'
	        type: string
	        default: '150'

	env:
	  # Quoted: environment variables are strings, so keep the literal text 'true'
	  # instead of relying on boolean-to-string coercion.
	  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: 'true'

	concurrency:
	  # One concurrency group per ref; an in-progress run is never cancelled by a
	  # newer one.
	  group: meridian-train-${{ github.ref }}
	  cancel-in-progress: false

	jobs:
	# Optional job: runs only when the manual force_seed input is checked, and
	# executes scripts/seed_hf_repo.py with the HF token (per the step name it
	# nukes and re-seeds the HF repo — confirm against the script).
	seed:
	  name: "Nuke & Seed HF"
	  # workflow_dispatch inputs read via github.event.inputs are strings, hence
	  # the comparison with 'true'.
	  if: github.event.inputs.force_seed == 'true'
	  runs-on: ubuntu-latest
	  environment: HuggingFace Hub
	  steps:
	    - uses: actions/checkout@v4
	      with:
	        token: ${{ secrets.GH_PAT }}

	    - name: Set up Python
	      uses: actions/setup-python@v5
	      with:
	        python-version: '3.11'

	    - name: Install deps
	      run: |
	        pip install huggingface_hub transformers torch safetensors sentencepiece python-dotenv

	    - name: Nuke & Seed
	      env:
	        HF_TOKEN: ${{ secrets.HF_TOKEN }}
	        TOKENIZER_ID: 'Qwen/Qwen2.5-0.5B'
	        PYTHONPATH: .
	      run: python scripts/seed_hf_repo.py

	# Main job: pull checkpoint, train, report failures, upload, smoke-test, sync.
	train:
	  name: "Hourly Training Run"
	  needs: seed
	  # always() keeps this job eligible when seed was skipped (any run without
	  # force_seed); it is still blocked when seed ran and did not succeed.
	  if: |
	    always() &&
	    (needs.seed.result == 'success' || needs.seed.result == 'skipped')
	  runs-on: ubuntu-latest
	  timeout-minutes: 90  # 1.5 hours max
	  permissions:
	    contents: write
	    issues: write
	  env:
	    # Persisted across runs via actions/cache so the Qwen base model + tokenizer are
	    # downloaded once. When HF returns 429, transformers falls back to this cache.
	    HF_HOME: ${{ github.workspace }}/.hf_cache
	    HF_HUB_DOWNLOAD_TIMEOUT: '60'

	  steps:
	# Check out the repository with full history, authenticated with the GH_PAT secret.
	- uses: actions/checkout@v4
	  with:
	    token: ${{ secrets.GH_PAT }}
	    fetch-depth: 0

	# Install Python 3.11 with setup-python's pip cache enabled.
	- name: Set up Python 3.11
	  uses: actions/setup-python@v5
	  with:
	    cache: pip
	    # Quoted so the version stays a string rather than a float.
	    python-version: "3.11"

	# Restore/save the .hf_cache directory under the workspace. The key is fixed,
	# and restore-keys falls back to any cache whose key starts with 'hf-cache-'.
	- name: Cache HuggingFace base model
	  uses: actions/cache@v4
	  with:
	    path: ${{ github.workspace }}/.hf_cache
	    key: hf-cache-qwen2.5-0.5b-v1
	    restore-keys: |
	      hf-cache-

	# Upgrade pip, install the project requirements, then the lint/format tools.
	- name: Install Dependencies
	  run: |
	    python -m pip install --upgrade pip
	    pip install -r requirements.txt
	    pip install ruff black

	# Reformat with black and apply ruff autofixes in place; both rewrite files in
	# the working tree.
	- name: Lint & Format
	  run: |
	    black . --quiet
	    ruff check . --fix --quiet

	# Download only checkpoint/ from the Hub repo into ./checkpoint, retrying with
	# capped exponential backoff. Any unrecoverable pull problem fails the step
	# instead of letting training start from scratch.
	- name: Pull Checkpoint from HuggingFace
	  env:
	    HF_TOKEN: ${{ secrets.HF_TOKEN }}
	    # Be patient and gentle with the Hub: longer per-request timeout, and the
	    # download itself uses few concurrent workers (high concurrency is what
	    # trips HF's 429 rate limiter on shared GitHub Actions IPs).
	    HF_HUB_DOWNLOAD_TIMEOUT: '60'
	    # Without this, Python buffers stdout and Actions flushes it all at the end,
	    # making every retry line share one timestamp (looks like no backoff happened).
	    PYTHONUNBUFFERED: '1'
	  run: |
	    python -c "
	    from huggingface_hub import snapshot_download
	    import os, shutil, time

	    repo_id = 'meridianal/FinAI'
	    token = os.getenv('HF_TOKEN')

	    # Only fetch the checkpoint/ folder (skips README, datasets, etc.) — fewer files
	    # means fewer metadata HEAD calls, the requests HF is 429-ing. We download
	    # directly and do NOT call list_repo_files first: that extra tree API call is
	    # itself 429-prone, and a 429 there used to drop us into 'start fresh', which
	    # makes train.py download the Qwen base (429-fails) and risks overwriting the
	    # good Hub checkpoint with a from-scratch model.
	    ALLOW = ['checkpoint/**']
	    IGNORE = ['checkpoint/pytorch_model.bin', '**/pytorch_model.bin']

	    # The pull is the single point of failure for the whole run (we abort rather
	    # than train from scratch), and HF/CloudFront blips can last several minutes.
	    # 8 attempts with capped exponential backoff = 10+20+40+80+120+120+120s of
	    # waiting (510s, roughly 8.5 min) before giving up on a transient outage.
	    temp_dir = './temp_download'
	    delay = 10
	    attempts = 8
	    last_err = None
	    for attempt in range(1, attempts + 1):
	        try:
	            snapshot_download(
	                repo_id=repo_id,
	                local_dir=temp_dir,
	                token=token,
	                allow_patterns=ALLOW,
	                ignore_patterns=IGNORE,
	                max_workers=2,
	            )
	            last_err = None
	            break
	        except Exception as _e:
	            last_err = _e
	            print(f' Checkpoint pull attempt {attempt}/{attempts} failed: {_e}')
	            # No sleep after the final attempt: fall through to the fatal check.
	            if attempt < attempts:
	                print(f' Retrying in {delay}s...')
	                time.sleep(delay)
	                delay = min(delay * 2, 120)

	    # A download exception after all retries is FATAL: the repo reliably holds the
	    # rolling checkpoint, so the only safe action is to abort rather than train from
	    # scratch and clobber it. Failing the job is better than a silent regression.
	    if last_err is not None:
	        raise SystemExit(f'FATAL: checkpoint pull failed after retries: {last_err}')

	    # Replace any existing ./checkpoint with the freshly downloaded folder.
	    if os.path.exists(os.path.join(temp_dir, 'checkpoint')):
	        if os.path.exists('./checkpoint'):
	            shutil.rmtree('./checkpoint')
	        shutil.move(os.path.join(temp_dir, 'checkpoint'), './checkpoint')
	        print('Checkpoint pulled')
	    if os.path.exists(temp_dir):
	        shutil.rmtree(temp_dir)

	    # Log which of the expected checkpoint files are present, with sizes.
	    for cf in ['model.safetensors', 'trainer_state.pt', 'config.json', 'ewc_state.pt']:
	        path = os.path.join('./checkpoint', cf)
	        if os.path.exists(path):
	            size_mb = os.path.getsize(path) / (1024 * 1024)
	            print(f' - {cf} ({size_mb:.2f} MB)')
	        else:
	            print(f' - {cf} NOT FOUND')

	    # snapshot_download succeeded but no weights -> the repo genuinely has no
	    # checkpoint yet (only legitimate before the first seed). Otherwise abort: a
	    # missing model.safetensors here would force a Hub base download (429) and a
	    # from-scratch upload over the good checkpoint.
	    if not os.path.exists('./checkpoint/model.safetensors'):
	        if os.getenv('ALLOW_FRESH_START') == '1':
	            print('No checkpoint in repo and ALLOW_FRESH_START=1 — starting fresh.')
	        else:
	            raise SystemExit(
	                'FATAL: no checkpoint/model.safetensors after a successful pull. '
	                'Set ALLOW_FRESH_START=1 to intentionally start from the base model.'
	            )
	    "

	# Run train.py under an 80-minute timeout, tee its output to train_output.log,
	# and publish train_exit_code / error_count / fatal_count / train_failed as
	# step outputs. The step itself always exits 0 so later steps still run.
	- name: Train
	  id: training
	  env:
	    HF_TOKEN: ${{ secrets.HF_TOKEN }}
	    COMET_API_KEY: ${{ secrets.COMET_API_KEY }}
	    TOKENIZER_ID: 'Qwen/Qwen2.5-0.5B'
	    DTYPE: 'bfloat16'
	    SKIP_OPTIMIZER_SAVE: '1'
	    HARD_RAM_GUARD: '1'
	    # The real OOM cause was the data pipeline, not the model: 25 streaming datasets
	    # each with a 2000-item shuffle buffer grew RSS past 15GB. With SHUFFLE_BUFFER
	    # small (below), steady usage is ~10-11GB, so these caps give room to actually
	    # train and only intervene near a genuine OOM. (model step peak ~5.75GB RSS,
	    # pipeline ~5GB — both measured.)
	    MAX_RAM_GB: '14.5'
	    SOFT_RAM_GB: '12.5'
	    SOFT_RAM_PCT: '80'
	    MIN_THROTTLE_SEQ_LEN: '64'
	    # Streaming-shuffle look-ahead per dataset; small because ~25 run concurrently.
	    SHUFFLE_BUFFER: '128'
	    DEBUG_STEPS: '0'
	    MAX_STEPS: ${{ github.event.inputs.max_steps || '150' }}
	    TOTAL_STEPS: '100000'
	    # GRAD_ACCUM=1: on the slow CPU runner a single checkpointed backward could
	    # take minutes, so accumulating 4 micro-steps never committed an optimizer
	    # step before the RAM guard tripped -> "0 items trained" every run. With
	    # accum=1 every micro-step IS an optimizer step, so progress (global_step,
	    # checkpoint, loss curve) advances and is saved even if the run is cut short.
	    GRAD_ACCUM: '1'
	    BATCH_SIZE: '1'
	    LEARNING_RATE: '5e-5'
	    # Reduced 384 -> 256: activation memory scales with sequence length and is the
	    # dominant term in the backward-pass peak on the CPU runner. 256 still gives the
	    # model meaningful context while keeping peak RAM under the MAX_RAM_GB hard cap.
	    BLOCK_SIZE: '256'
	    MAX_BYTES: '26214400'
	    USE_EWC: '1'
	    EWC_LAMBDA: '75.0'
	    EWC_SAMPLES: '20'
	    # OFF: at batch=1/block=256 checkpointing saves only ~300MB of activations
	    # but ~doubles each (already slow) CPU backward by recomputing the forward.
	    # That recompute was the reason a step never finished in time. Memory is
	    # bounded by the guards + small shuffle buffer instead.
	    GRADIENT_CHECKPOINTING: '0'
	    OPTIMIZER: 'adafactor'
	    FREE_OPTIMIZER_BEFORE_FISHER: '1'
	    FISHER_SEQ_LEN: '64'
	    FISHER_THRESHOLD: '5e-4'
	    FISHER_TOP_K_RATIO: '0.08'
	    USE_LIGHT_DATASETS: '0'
	    # Stream at most 1 heavy dataset (fineweb-edu / OpenMathInstruct-2) at a
	    # time; their multi-GB Parquet row-groups were filling the 16GB runner.
	    # Which heavy dataset is active rotates per run, so both are still trained on.
	    MAX_HEAVY_CONCURRENT: '1'
	    # fineweb-edu's huge Parquet row-groups blow the 16GB runner even alone
	    # (runs that rotated to it trained only 2-3 steps before the guard fired,
	    # vs ~50 for OpenMathInstruct-2). Drop it here; re-enable on a bigger runner.
	    EXCLUDE_DATASETS: 'HuggingFaceFW/fineweb-edu'
	    GC_EVERY_STEPS: '2'
	    # Collect freed Arrow row-groups every few micro-steps so streaming memory
	    # can't creep to the guard before the first optimizer step runs any gc.
	    GC_EVERY_MICROSTEPS: '4'
	    PYTHONPATH: .
	    PYTHONUNBUFFERED: '1'
	  run: |
	    # PIPESTATUS[0] is the exit status of timeout/python, not of tee.
	    # NOTE(review): this relies on the step shell not using pipefail together
	    # with -e, otherwise a failing train.py would end the script here.
	    timeout 4800 python train.py 2>&1 | tee train_output.log
	    TRAIN_EXIT=${PIPESTATUS[0]}
	    echo "train_exit_code=$TRAIN_EXIT" >> "$GITHUB_OUTPUT"

	    # Count every [ERROR] line in the log; the failure threshold (more than 50)
	    # is applied below. grep -c exits 1 on zero matches, hence the fallback.
	    ERROR_COUNT=$(grep -c '\[ERROR\]' train_output.log 2>/dev/null) || ERROR_COUNT=0
	    echo "error_count=$ERROR_COUNT" >> "$GITHUB_OUTPUT"

	    # Extract unique error messages for the issue body
	    grep '\[ERROR\]' train_output.log | sort -u | head -10 > train_errors.txt 2>/dev/null || true

	    # Detect fatal patterns (NaN loss explosion, OOM, etc.)
	    FATAL=$(grep -cE 'CUDA out of memory|Loss is NaN|RuntimeError|FATAL' train_output.log 2>/dev/null) || FATAL=0
	    echo "fatal_count=$FATAL" >> "$GITHUB_OUTPUT"

	    # NOTE(review): TRAIN_EXIT is recorded but does not influence train_failed.
	    if [ "$ERROR_COUNT" -gt 50 ] || [ "$FATAL" -gt 0 ]; then
	      echo "train_failed=true" >> "$GITHUB_OUTPUT"
	      echo "[FAIL] Training had $ERROR_COUNT errors, $FATAL fatal issues."
	    else
	      echo "train_failed=false" >> "$GITHUB_OUTPUT"
	    fi

	    # Still save checkpoint even on errors
	    exit 0

	# When the Train step flagged a failure, post a report: comment on the open
	# 'training-failure' issue if one exists, otherwise create a new issue.
	- name: Auto-create issue on training failure
	  if: steps.training.outputs.train_failed == 'true'
	  env:
	    GH_TOKEN: ${{ secrets.GH_PAT }}
	  run: |
	    ERROR_COUNT=${{ steps.training.outputs.error_count }}
	    FATAL_COUNT=${{ steps.training.outputs.fatal_count }}
	    EXIT_CODE=${{ steps.training.outputs.train_exit_code }}
	    RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"

	    # Read unique errors. The placeholder is applied on EMPTY output: the old
	    # 'cat ... | head || echo' form never reached the echo, because the
	    # pipeline's status is head's (always 0), so a missing or empty file
	    # produced a blank section in the report.
	    ERRORS=$(head -10 train_errors.txt 2>/dev/null || true)
	    if [ -z "$ERRORS" ]; then
	      ERRORS="No error details captured"
	    fi

	    # Check for existing open issue to avoid duplicates
	    EXISTING=$(gh issue list --state open --label "training-failure" --limit 1 --json number -q '.[0].number' 2>/dev/null || echo "")

	    # Unquoted heredoc: variables and $(date) expand; the backticks are escaped
	    # so they are not treated as command substitution.
	    BODY="$(cat <<EOF
	    ## Training Failure Report

	    Run: ${RUN_URL}
	    Date: $(date -u +%Y-%m-%dT%H:%M:%SZ)
	    Exit Code: ${EXIT_CODE}
	    Error Count: ${ERROR_COUNT}
	    Fatal Count: ${FATAL_COUNT}

	    ### Unique Errors
	    \`\`\`
	    ${ERRORS}
	    \`\`\`

	    ### Action Required
	    - [ ] Investigate root cause
	    - [ ] Check if EWC state file is stale (shape mismatch)
	    - [ ] Check model architecture hasn't changed
	    - [ ] Re-run training after fix

	    Auto-generated by CI
	    EOF
	    )"

	    if [ -n "$EXISTING" ]; then
	      echo "Appending to existing issue #$EXISTING"
	      gh issue comment "$EXISTING" --body "$BODY"
	    else
	      gh issue create \
	        --title "Training failure: ${ERROR_COUNT} errors detected ($(date -u +%Y-%m-%d))" \
	        --body "$BODY" \
	        --label "training-failure,bug"
	    fi

	# Upload ./checkpoint to the Hub repo (fatal on failure), then upload
	# MODEL_CARD.md as the Hub README.md (non-fatal). Runs even if earlier steps failed.
	- name: Upload Checkpoint to HuggingFace
	  if: always()
	  env:
	    HF_TOKEN: ${{ secrets.HF_TOKEN }}
	  run: |
	    python -c "
	    from huggingface_hub import HfApi
	    import os

	    # raise SystemExit is used instead of the site-module exit() helper, which
	    # is meant for interactive sessions and is not guaranteed to be defined.
	    if not os.path.exists('./checkpoint'):
	        print('No checkpoint found, skipping')
	        raise SystemExit(0)

	    api = HfApi()
	    token = os.getenv('HF_TOKEN')
	    repo_id = 'meridianal/FinAI'
	    print(f'Uploading checkpoint to {repo_id}...')
	    try:
	        api.upload_folder(
	            folder_path='./checkpoint',
	            repo_id=repo_id,
	            path_in_repo='checkpoint',
	            commit_message='Hourly training update [skip ci]',
	            token=token,
	            delete_patterns=['pytorch_model.bin', '**/pytorch_model.bin'],
	        )
	        print('Checkpoint upload successful')
	    except Exception as e:
	        print(f'Checkpoint upload failed: {e}')
	        raise SystemExit(1)

	    # Upload model card to HF repo root (MODEL_CARD.md carries the HF YAML frontmatter;
	    # the repo README.md is the GitHub landing page and intentionally has no frontmatter)
	    model_card_path = './MODEL_CARD.md'
	    if os.path.exists(model_card_path):
	        print('Uploading model card...')
	        try:
	            api.upload_file(
	                path_or_fileobj=model_card_path,
	                path_in_repo='README.md',
	                repo_id=repo_id,
	                commit_message='Update model card [skip ci]',
	                token=token,
	            )
	            print('Model card upload successful')
	        except Exception as e:
	            print(f'Model card upload failed (non-fatal): {e}')
	    else:
	        print('No MODEL_CARD.md found, skipping model card upload')
	    "

	# Load ./checkpoint and greedily generate for two fixed prompts, printing
	# [OK]/[WARN] per prompt. Never fails the job: a missing checkpoint is skipped
	# and any exception inside the try block is only reported.
	- name: Generation Smoke Test
	  if: always()
	  run: |
	    python -c "
	    import os, sys, time, torch

	    ckpt = './checkpoint'
	    if not os.path.exists(os.path.join(ckpt, 'model.safetensors')):
	        print('[SKIP] No checkpoint found — skipping smoke test')
	        sys.exit(0)

	    try:
	        from transformers import AutoModelForCausalLM, AutoTokenizer
	        tokenizer = AutoTokenizer.from_pretrained(ckpt)
	        model = AutoModelForCausalLM.from_pretrained(
	            ckpt, torch_dtype=torch.float32, low_cpu_mem_usage=True
	        )
	        model.eval()

	        prompts = [
	            '### Instruction:\nWhat is the Black-Scholes model used for?\n\n### Response:\n',
	            '### Instruction:\nExplain compound interest briefly.\n\n### Response:\n',
	        ]

	        print()
	        all_ok = True
	        for i, prompt in enumerate(prompts, 1):
	            inputs = tokenizer(prompt, return_tensors='pt')
	            t0 = time.time()
	            with torch.no_grad():
	                out = model.generate(
	                    **inputs,
	                    max_new_tokens=80,
	                    do_sample=False,
	                    repetition_penalty=1.1,
	                    pad_token_id=tokenizer.eos_token_id,
	                )
	            elapsed = time.time() - t0
	            # Only the tokens generated after the prompt are counted and decoded.
	            new_tokens = out.shape[1] - inputs['input_ids'].shape[1]
	            response = tokenizer.decode(out[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

	            # Quality checks: at least 20 new tokens and at least 40% distinct
	            # whitespace-separated words.
	            words = response.split()
	            unique_ratio = len(set(words)) / max(len(words), 1)
	            ok = new_tokens >= 20 and unique_ratio >= 0.4

	            status = '[OK]' if ok else '[WARN]'
	            print(f'Prompt {i}: {status} {new_tokens} new tokens, {unique_ratio:.2f} unique ratio, {elapsed:.1f}s')
	            print(f' Response: {response[:120].strip()!r}')
	            if not ok:
	                all_ok = False

	        print()
	        print('[OK] Smoke test complete' if all_ok else '[WARN] Some prompts produced low-quality output')
	    except Exception as e:
	        print(f'[WARN] Smoke test failed (non-fatal): {e}')
	    "

	# Commit any working-tree changes, rebase onto origin/main (falling back to a
	# merge pull if the rebase fails), and push main.
	- name: Sync Dataset State
	  if: always()
	  run: |
	    git config --local user.email "action@github.com"
	    git config --local user.name "Meridian.AI Bot"
	    # NOTE(review): 'git add .' stages everything not covered by .gitignore;
	    # confirm that checkpoint/, .hf_cache/ and the train_*.log/txt files are ignored.
	    git add .
	    git diff --staged --quiet || git commit -m "chore: sync dataset state & formatting [skip ci]"
	    git pull --rebase -X theirs origin main || (git rebase --abort && git pull --no-rebase origin main)
	    # Push through 'origin' rather than a URL with the PAT embedded in it:
	    # actions/checkout was given token GH_PAT and persists it in the local git
	    # config, so origin is already authenticated. This also keeps the push
	    # target the same remote the pull above used.
	    git push origin main

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Meridian.AI Train #2060

Workflow file

Meridian.AI Train #2060

Uh oh!

Workflow file for this run