Meridian.AI Train #2060
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: Meridian.AI Train | |
| on: | |
| schedule: | |
| # Every hour, every day | |
| - cron: '0 * * * *' | |
| workflow_dispatch: | |
| inputs: | |
| force_seed: | |
| description: 'Nuke & re-seed HF repo with fresh model?' | |
| type: boolean | |
| default: false | |
| max_steps: | |
| description: 'Training steps per run (default: 150)' | |
| type: string | |
| default: '150' | |
| env: | |
| FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true | |
| concurrency: | |
| group: meridian-train-${{ github.ref }} | |
| cancel-in-progress: false | |
| jobs: | |
| seed: | |
| name: "Nuke & Seed HF" | |
| if: github.event.inputs.force_seed == 'true' | |
| runs-on: ubuntu-latest | |
| environment: HuggingFace Hub | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| token: ${{ secrets.GH_PAT }} | |
| - name: Set up Python | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: '3.11' | |
| - name: Install deps | |
| run: | | |
| pip install huggingface_hub transformers torch safetensors sentencepiece python-dotenv | |
| - name: Nuke & Seed | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| TOKENIZER_ID: 'Qwen/Qwen2.5-0.5B' | |
| PYTHONPATH: . | |
| run: python scripts/seed_hf_repo.py | |
| train: | |
| name: "Hourly Training Run" | |
| needs: seed | |
| if: | | |
| always() && | |
| (needs.seed.result == 'success' || needs.seed.result == 'skipped') | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 90 # 1.5 hours max | |
| permissions: | |
| contents: write | |
| issues: write | |
| env: | |
| # Persisted across runs via actions/cache so the Qwen base model + tokenizer are | |
| # downloaded once. When HF returns 429, transformers falls back to this cache. | |
| HF_HOME: ${{ github.workspace }}/.hf_cache | |
| HF_HUB_DOWNLOAD_TIMEOUT: '60' | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 | |
| token: ${{ secrets.GH_PAT }} | |
| - name: Set up Python 3.11 | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: '3.11' | |
| cache: 'pip' | |
| - name: Cache HuggingFace base model | |
| uses: actions/cache@v4 | |
| with: | |
| path: ${{ github.workspace }}/.hf_cache | |
| key: hf-cache-qwen2.5-0.5b-v1 | |
| restore-keys: | | |
| hf-cache- | |
| - name: Install Dependencies | |
| run: | | |
| python -m pip install --upgrade pip | |
| pip install -r requirements.txt | |
| pip install ruff black | |
| - name: Lint & Format | |
| run: | | |
| black . --quiet | |
| ruff check . --fix --quiet | |
| - name: Pull Checkpoint from HuggingFace | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| # Be patient and gentle with the Hub: longer per-request timeout, and the | |
| # download itself uses few concurrent workers (high concurrency is what | |
| # trips HF's 429 rate limiter on shared GitHub Actions IPs). | |
| HF_HUB_DOWNLOAD_TIMEOUT: '60' | |
| # Without this, Python buffers stdout and Actions flushes it all at the end, | |
| # making every retry line share one timestamp (looks like no backoff happened). | |
| PYTHONUNBUFFERED: '1' | |
| run: | | |
| python -c " | |
| from huggingface_hub import snapshot_download | |
| import os, shutil, time | |
| repo_id = 'meridianal/FinAI' | |
| token = os.getenv('HF_TOKEN') | |
| # Only fetch the checkpoint/ folder (skips README, datasets, etc.) — fewer files | |
| # means fewer metadata HEAD calls, the requests HF is 429-ing. We download | |
| # directly and do NOT call list_repo_files first: that extra tree API call is | |
| # itself 429-prone, and a 429 there used to drop us into 'start fresh', which | |
| # makes train.py download the Qwen base (429-fails) and risks overwriting the | |
| # good Hub checkpoint with a from-scratch model. | |
| ALLOW = ['checkpoint/**'] | |
| IGNORE = ['checkpoint/pytorch_model.bin', '**/pytorch_model.bin'] | |
| # The pull is the single point of failure for the whole run (we abort rather | |
| # than train from scratch), and HF/CloudFront blips can last several minutes. | |
| # 8 attempts with capped exponential backoff = ~10+20+40+80+120+120+120s of | |
| # waiting (~10 min total window) before giving up on a transient outage. | |
| temp_dir = './temp_download' | |
| delay = 10 | |
| attempts = 8 | |
| last_err = None | |
| for attempt in range(1, attempts + 1): | |
| try: | |
| snapshot_download( | |
| repo_id=repo_id, | |
| local_dir=temp_dir, | |
| token=token, | |
| allow_patterns=ALLOW, | |
| ignore_patterns=IGNORE, | |
| max_workers=2, | |
| ) | |
| last_err = None | |
| break | |
| except Exception as _e: | |
| last_err = _e | |
| print(f' Checkpoint pull attempt {attempt}/{attempts} failed: {_e}') | |
| if attempt < attempts: | |
| print(f' Retrying in {delay}s...') | |
| time.sleep(delay) | |
| delay = min(delay * 2, 120) | |
| # A download exception after all retries is FATAL: the repo reliably holds the | |
| # rolling checkpoint, so the only safe action is to abort rather than train from | |
| # scratch and clobber it. Failing the job is better than a silent regression. | |
| if last_err is not None: | |
| raise SystemExit(f'FATAL: checkpoint pull failed after retries: {last_err}') | |
| if os.path.exists(os.path.join(temp_dir, 'checkpoint')): | |
| if os.path.exists('./checkpoint'): | |
| shutil.rmtree('./checkpoint') | |
| shutil.move(os.path.join(temp_dir, 'checkpoint'), './checkpoint') | |
| print('Checkpoint pulled') | |
| if os.path.exists(temp_dir): | |
| shutil.rmtree(temp_dir) | |
| for cf in ['model.safetensors', 'trainer_state.pt', 'config.json', 'ewc_state.pt']: | |
| path = os.path.join('./checkpoint', cf) | |
| if os.path.exists(path): | |
| size_mb = os.path.getsize(path) / (1024 * 1024) | |
| print(f' - {cf} ({size_mb:.2f} MB)') | |
| else: | |
| print(f' - {cf} NOT FOUND') | |
| # snapshot_download succeeded but no weights -> the repo genuinely has no | |
| # checkpoint yet (only legitimate before the first seed). Otherwise abort: a | |
| # missing model.safetensors here would force a Hub base download (429) and a | |
| # from-scratch upload over the good checkpoint. | |
| if not os.path.exists('./checkpoint/model.safetensors'): | |
| if os.getenv('ALLOW_FRESH_START') == '1': | |
| print('No checkpoint in repo and ALLOW_FRESH_START=1 — starting fresh.') | |
| else: | |
| raise SystemExit( | |
| 'FATAL: no checkpoint/model.safetensors after a successful pull. ' | |
| 'Set ALLOW_FRESH_START=1 to intentionally start from the base model.' | |
| ) | |
| " | |
| - name: Train | |
| id: training | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| COMET_API_KEY: ${{ secrets.COMET_API_KEY }} | |
| TOKENIZER_ID: 'Qwen/Qwen2.5-0.5B' | |
| DTYPE: 'bfloat16' | |
| SKIP_OPTIMIZER_SAVE: '1' | |
| HARD_RAM_GUARD: '1' | |
| # The real OOM cause was the data pipeline, not the model: 25 streaming datasets | |
| # each with a 2000-item shuffle buffer grew RSS past 15GB. With SHUFFLE_BUFFER | |
| # small (below), steady usage is ~10-11GB, so these caps give room to actually | |
| # train and only intervene near a genuine OOM. (model step peak ~5.75GB RSS, | |
| # pipeline ~5GB — both measured.) | |
| MAX_RAM_GB: '14.5' | |
| SOFT_RAM_GB: '12.5' | |
| SOFT_RAM_PCT: '80' | |
| MIN_THROTTLE_SEQ_LEN: '64' | |
| # Streaming-shuffle look-ahead per dataset; small because ~25 run concurrently. | |
| SHUFFLE_BUFFER: '128' | |
| DEBUG_STEPS: '0' | |
| MAX_STEPS: ${{ github.event.inputs.max_steps || '150' }} | |
| TOTAL_STEPS: '100000' | |
| # GRAD_ACCUM=1: on the slow CPU runner a single checkpointed backward could | |
| # take minutes, so accumulating 4 micro-steps never committed an optimizer | |
| # step before the RAM guard tripped -> "0 items trained" every run. With | |
| # accum=1 every micro-step IS an optimizer step, so progress (global_step, | |
| # checkpoint, loss curve) advances and is saved even if the run is cut short. | |
| GRAD_ACCUM: '1' | |
| BATCH_SIZE: '1' | |
| LEARNING_RATE: '5e-5' | |
| # Reduced 384 -> 256: activation memory scales with sequence length and is the | |
| # dominant term in the backward-pass peak on the CPU runner. 256 still gives the | |
| # model meaningful context while keeping peak RAM well under the 13GB hard cap. | |
| BLOCK_SIZE: '256' | |
| MAX_BYTES: '26214400' | |
| USE_EWC: '1' | |
| EWC_LAMBDA: '75.0' | |
| EWC_SAMPLES: '20' | |
| # OFF: at batch=1/block=256 checkpointing saves only ~300MB of activations | |
| # but ~doubles each (already slow) CPU backward by recomputing the forward. | |
| # That recompute was the reason a step never finished in time. Memory is | |
| # bounded by the guards + small shuffle buffer instead. | |
| GRADIENT_CHECKPOINTING: '0' | |
| OPTIMIZER: 'adafactor' | |
| FREE_OPTIMIZER_BEFORE_FISHER: '1' | |
| FISHER_SEQ_LEN: '64' | |
| FISHER_THRESHOLD: '5e-4' | |
| FISHER_TOP_K_RATIO: '0.08' | |
| USE_LIGHT_DATASETS: '0' | |
| # Stream at most 1 heavy dataset (fineweb-edu / OpenMathInstruct-2) at a | |
| # time; their multi-GB Parquet row-groups were filling the 16GB runner. | |
| # Which heavy dataset is active rotates per run, so both are still trained on. | |
| MAX_HEAVY_CONCURRENT: '1' | |
| # fineweb-edu's huge Parquet row-groups blow the 16GB runner even alone | |
| # (runs that rotated to it trained only 2-3 steps before the guard fired, | |
| # vs ~50 for OpenMathInstruct-2). Drop it here; re-enable on a bigger runner. | |
| EXCLUDE_DATASETS: 'HuggingFaceFW/fineweb-edu' | |
| GC_EVERY_STEPS: '2' | |
| # Collect freed Arrow row-groups every few micro-steps so streaming memory | |
| # can't creep to the guard before the first optimizer step runs any gc. | |
| GC_EVERY_MICROSTEPS: '4' | |
| PYTHONPATH: . | |
| PYTHONUNBUFFERED: '1' | |
| run: | | |
| timeout 4800 python train.py 2>&1 | tee train_output.log | |
| TRAIN_EXIT=${PIPESTATUS[0]} | |
| echo "train_exit_code=$TRAIN_EXIT" >> "$GITHUB_OUTPUT" | |
| # Detect repeated errors (10+ occurrences of same error = systemic failure) | |
| ERROR_COUNT=$(grep -c '\[ERROR\]' train_output.log 2>/dev/null) || ERROR_COUNT=0 | |
| echo "error_count=$ERROR_COUNT" >> "$GITHUB_OUTPUT" | |
| # Extract unique error messages for the issue body | |
| grep '\[ERROR\]' train_output.log | sort -u | head -10 > train_errors.txt 2>/dev/null || true | |
| # Detect fatal patterns (NaN loss explosion, OOM, etc.) | |
| FATAL=$(grep -cE 'CUDA out of memory|Loss is NaN|RuntimeError|FATAL' train_output.log 2>/dev/null) || FATAL=0 | |
| echo "fatal_count=$FATAL" >> "$GITHUB_OUTPUT" | |
| if [ "$ERROR_COUNT" -gt 50 ] || [ "$FATAL" -gt 0 ]; then | |
| echo "train_failed=true" >> "$GITHUB_OUTPUT" | |
| echo "[FAIL] Training had $ERROR_COUNT errors, $FATAL fatal issues." | |
| else | |
| echo "train_failed=false" >> "$GITHUB_OUTPUT" | |
| fi | |
| # Still save checkpoint even on errors | |
| exit 0 | |
| - name: Auto-create issue on training failure | |
| if: steps.training.outputs.train_failed == 'true' | |
| env: | |
| GH_TOKEN: ${{ secrets.GH_PAT }} | |
| run: | | |
| ERROR_COUNT=${{ steps.training.outputs.error_count }} | |
| FATAL_COUNT=${{ steps.training.outputs.fatal_count }} | |
| EXIT_CODE=${{ steps.training.outputs.train_exit_code }} | |
| RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" | |
| # Read unique errors | |
| ERRORS=$(cat train_errors.txt 2>/dev/null | head -10 || echo "No error details captured") | |
| # Check for existing open issue to avoid duplicates | |
| EXISTING=$(gh issue list --state open --label "training-failure" --limit 1 --json number -q '.[0].number' 2>/dev/null || echo "") | |
| BODY="$(cat <<EOF | |
| ## Training Failure Report | |
| **Run:** ${RUN_URL} | |
| **Date:** $(date -u +%Y-%m-%dT%H:%M:%SZ) | |
| **Exit Code:** ${EXIT_CODE} | |
| **Error Count:** ${ERROR_COUNT} | |
| **Fatal Count:** ${FATAL_COUNT} | |
| ### Unique Errors | |
| \`\`\` | |
| ${ERRORS} | |
| \`\`\` | |
| ### Action Required | |
| - [ ] Investigate root cause | |
| - [ ] Check if EWC state file is stale (shape mismatch) | |
| - [ ] Check model architecture hasn't changed | |
| - [ ] Re-run training after fix | |
| *Auto-generated by CI* | |
| EOF | |
| )" | |
| if [ -n "$EXISTING" ]; then | |
| echo "Appending to existing issue #$EXISTING" | |
| gh issue comment "$EXISTING" --body "$BODY" | |
| else | |
| gh issue create \ | |
| --title "Training failure: ${ERROR_COUNT} errors detected ($(date -u +%Y-%m-%d))" \ | |
| --body "$BODY" \ | |
| --label "training-failure,bug" | |
| fi | |
| - name: Upload Checkpoint to HuggingFace | |
| if: always() | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| run: | | |
| python -c " | |
| from huggingface_hub import HfApi | |
| import os | |
| if not os.path.exists('./checkpoint'): | |
| print('No checkpoint found, skipping') | |
| exit(0) | |
| api = HfApi() | |
| token = os.getenv('HF_TOKEN') | |
| repo_id = 'meridianal/FinAI' | |
| print(f'Uploading checkpoint to {repo_id}...') | |
| try: | |
| api.upload_folder( | |
| folder_path='./checkpoint', | |
| repo_id=repo_id, | |
| path_in_repo='checkpoint', | |
| commit_message='Hourly training update [skip ci]', | |
| token=token, | |
| delete_patterns=['pytorch_model.bin', '**/pytorch_model.bin'], | |
| ) | |
| print('Checkpoint upload successful') | |
| except Exception as e: | |
| print(f'Checkpoint upload failed: {e}') | |
| exit(1) | |
| # Upload model card to HF repo root (MODEL_CARD.md carries the HF YAML frontmatter; | |
| # the repo README.md is the GitHub landing page and intentionally has no frontmatter) | |
| model_card_path = './MODEL_CARD.md' | |
| if os.path.exists(model_card_path): | |
| print('Uploading model card...') | |
| try: | |
| api.upload_file( | |
| path_or_fileobj=model_card_path, | |
| path_in_repo='README.md', | |
| repo_id=repo_id, | |
| commit_message='Update model card [skip ci]', | |
| token=token, | |
| ) | |
| print('Model card upload successful') | |
| except Exception as e: | |
| print(f'Model card upload failed (non-fatal): {e}') | |
| else: | |
| print('No MODEL_CARD.md found, skipping model card upload') | |
| " | |
| - name: Generation Smoke Test | |
| if: always() | |
| run: | | |
| python -c " | |
| import os, sys, time, torch | |
| ckpt = './checkpoint' | |
| if not os.path.exists(os.path.join(ckpt, 'model.safetensors')): | |
| print('[SKIP] No checkpoint found — skipping smoke test') | |
| sys.exit(0) | |
| try: | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| tokenizer = AutoTokenizer.from_pretrained(ckpt) | |
| model = AutoModelForCausalLM.from_pretrained( | |
| ckpt, torch_dtype=torch.float32, low_cpu_mem_usage=True | |
| ) | |
| model.eval() | |
| prompts = [ | |
| '### Instruction:\nWhat is the Black-Scholes model used for?\n\n### Response:\n', | |
| '### Instruction:\nExplain compound interest briefly.\n\n### Response:\n', | |
| ] | |
| print() | |
| all_ok = True | |
| for i, prompt in enumerate(prompts, 1): | |
| inputs = tokenizer(prompt, return_tensors='pt') | |
| t0 = time.time() | |
| with torch.no_grad(): | |
| out = model.generate( | |
| **inputs, | |
| max_new_tokens=80, | |
| do_sample=False, | |
| repetition_penalty=1.1, | |
| pad_token_id=tokenizer.eos_token_id, | |
| ) | |
| elapsed = time.time() - t0 | |
| new_tokens = out.shape[1] - inputs['input_ids'].shape[1] | |
| response = tokenizer.decode(out[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True) | |
| # Quality checks | |
| words = response.split() | |
| unique_ratio = len(set(words)) / max(len(words), 1) | |
| ok = new_tokens >= 20 and unique_ratio >= 0.4 | |
| status = '[OK]' if ok else '[WARN]' | |
| print(f'Prompt {i}: {status} {new_tokens} new tokens, {unique_ratio:.2f} unique ratio, {elapsed:.1f}s') | |
| print(f' Response: {response[:120].strip()!r}') | |
| if not ok: | |
| all_ok = False | |
| print() | |
| print('[OK] Smoke test complete' if all_ok else '[WARN] Some prompts produced low-quality output') | |
| except Exception as e: | |
| print(f'[WARN] Smoke test failed (non-fatal): {e}') | |
| " | |
| - name: Sync Dataset State | |
| if: always() | |
| env: | |
| GH_PAT: ${{ secrets.GH_PAT }} | |
| run: | | |
| git config --local user.email "action@github.com" | |
| git config --local user.name "Meridian.AI Bot" | |
| git add . | |
| git diff --staged --quiet || git commit -m "chore: sync dataset state & formatting [skip ci]" | |
| git pull --rebase -X theirs origin main || (git rebase --abort && git pull --no-rebase origin main) | |
| git push https://$GH_PAT@github.com/MeridianAlgo/FinAI.git main |