Files
git.stella-ops.org/docs/modules/binary-index/golden-corpus-maintenance.md
2026-01-22 19:08:46 +02:00

13 KiB

Golden Corpus Maintenance

Sprint: SPRINT_20260121_036_BinaryIndex_golden_corpus_bundle_verification Task: GCB-006 - Document corpus folder layout and maintenance procedures

Overview

This document describes maintenance procedures for the golden corpus, including:

  • Mirror synchronization
  • Baseline management
  • Evidence bundle generation
  • Health monitoring

Mirror Synchronization

Automated Sync Schedule

Mirror sync should be automated via cron jobs or CI scheduled workflows.

| Mirror | Frequency | Rationale |
|--------|-----------|-----------|
| Debian archive | Daily | Security updates published daily |
| Debian buildinfo | Daily | Matches archive updates |
| Ubuntu archive | Daily | Security updates published daily |
| Ubuntu USN index | Hourly | USN metadata changes frequently |
| Alpine secdb | Daily | Less frequent updates |
| OSV database | Hourly | Aggregates multiple sources |

Sync Scripts

Debian Mirror Sync

#!/bin/bash
# sync-debian-mirrors.sh
# Syncs Debian archives and buildinfo
#
# Environment (all optional):
#   MIRRORS_ROOT      local mirror root
#   DEBIAN_RSYNC_URL  rsync base URL of the Debian archive tree
#   BUILDINFO_URL     base URL of the buildinfo service
#   DEBIAN_MIRROR     HTTPS snapshot URL; accepted for compatibility, not read below

set -euo pipefail

MIRRORS_ROOT="${MIRRORS_ROOT:-/data/golden-corpus/mirrors}"
# shellcheck disable=SC2034  # kept so existing environments keep working
DEBIAN_MIRROR="${DEBIAN_MIRROR:-https://snapshot.debian.org}"
# rsync needs an rsync:// URL, so it gets its own overridable setting
# (the default is the host and path that used to be hard-coded in the loop).
DEBIAN_RSYNC_URL="${DEBIAN_RSYNC_URL:-rsync://snapshot.debian.org/snapshot/debian}"
BUILDINFO_URL="${BUILDINFO_URL:-https://buildinfos.debian.net}"

# Packages to mirror (security-relevant)
PACKAGES=(openssl curl zlib glibc libxml2 libpng)

# Prints the pool subdirectory for a source package.
# The Debian pool files "lib*" packages under their first four characters
# (libx/libxml2) and everything else under the first character (o/openssl).
pool_prefix() {
    local name=$1
    if [[ "$name" == lib* ]]; then
        printf '%s\n' "${name:0:4}"
    else
        printf '%s\n' "${name:0:1}"
    fi
}

# Sync source packages
for pkg in "${PACKAGES[@]}"; do
    echo "Syncing Debian sources for: $pkg"

    pool_path="pool/main/$(pool_prefix "$pkg")/$pkg"

    # Create package directory
    mkdir -p "$MIRRORS_ROOT/debian/archive/$pool_path"

    # Download available versions (a failure here aborts the script via set -e)
    rsync -avz --progress \
        "$DEBIAN_RSYNC_URL/$pool_path/" \
        "$MIRRORS_ROOT/debian/archive/$pool_path/"
done

# Sync buildinfo files (best effort: a failure is reported but does not abort)
buildinfo_failures=0
for pkg in "${PACKAGES[@]}"; do
    echo "Syncing buildinfo for: $pkg"

    mkdir -p "$MIRRORS_ROOT/debian/buildinfo/$pkg"

    # Use wget to fetch buildinfo index and files.
    # NOTE(review): --cut-dirs=2 strips only "api/v1", so files land under
    # .../buildinfo/$pkg/buildinfo/$pkg/ - confirm this nesting is intended.
    if ! wget -r -np -nH --cut-dirs=2 -P "$MIRRORS_ROOT/debian/buildinfo/$pkg" \
        "$BUILDINFO_URL/api/v1/buildinfo/$pkg/"; then
        echo "WARNING: buildinfo sync failed for: $pkg" >&2
        buildinfo_failures=$((buildinfo_failures + 1))
    fi
done

if (( buildinfo_failures > 0 )); then
    echo "WARNING: buildinfo sync failed for $buildinfo_failures package(s)" >&2
fi

echo "Debian mirror sync complete"
# Fixed locale and UTC so that `date -d` can always parse the stamp back.
LC_ALL=C date -u > "$MIRRORS_ROOT/debian/.last-sync"

Ubuntu Mirror Sync

#!/bin/bash
# sync-ubuntu-mirrors.sh
# Syncs Ubuntu archives and USN metadata
#
# Environment (optional):
#   MIRRORS_ROOT  local mirror root

set -euo pipefail

MIRRORS_ROOT="${MIRRORS_ROOT:-/data/golden-corpus/mirrors}"
# Not read by the code shown here; presumably consumed by the elided
# package sync logic below.
UBUNTU_ARCHIVE="https://archive.ubuntu.com/ubuntu"
USN_API="https://ubuntu.com/security/notices.json"

usn_dir="$MIRRORS_ROOT/ubuntu/usn-index"
usn_tmp="$usn_dir/usn-db.json.tmp"

# Never leave a partial download behind, whichever way the script exits.
trap 'rm -f -- "$usn_tmp"' EXIT

# Sync USN database
echo "Syncing Ubuntu USN database..."
mkdir -p "$usn_dir"
# --fail: an HTTP error must abort here instead of an error page being
# installed as usn-db.json by the mv below.
curl -fsSL "$USN_API" -o "$usn_tmp"
mv -- "$usn_tmp" "$usn_dir/usn-db.json"

# Sync packages (similar to Debian)
# NOTE(review): pool directories are keyed by source package name; zlib1g is
# a binary package name (source: zlib) - confirm which one is intended.
PACKAGES=(openssl curl zlib1g libxml2)

for pkg in "${PACKAGES[@]}"; do
    echo "Syncing Ubuntu sources for: $pkg"

    # The pool files "lib*" packages under their first four characters
    # (libx/libxml2) and everything else under the first character.
    if [[ "$pkg" == lib* ]]; then
        prefix="${pkg:0:4}"
    else
        prefix="${pkg:0:1}"
    fi

    mkdir -p "$MIRRORS_ROOT/ubuntu/archive/pool/main/$prefix/$pkg"
    # ... sync logic
done

echo "Ubuntu mirror sync complete"
# Fixed locale and UTC so that `date -d` can always parse the stamp back.
LC_ALL=C date -u > "$MIRRORS_ROOT/ubuntu/.last-sync"

Alpine SecDB Sync

#!/bin/bash
# sync-alpine-secdb.sh
# Syncs Alpine security database
#
# Environment (optional):
#   MIRRORS_ROOT  local mirror root
#
# Individual branch/repo downloads are best effort; the script only fails
# (and skips the .last-sync stamp) when every download failed.

set -euo pipefail

MIRRORS_ROOT="${MIRRORS_ROOT:-/data/golden-corpus/mirrors}"
ALPINE_SECDB="https://secdb.alpinelinux.org"

mkdir -p "$MIRRORS_ROOT/alpine/secdb"

attempted=0
failed=0

# Download all security databases
for branch in v3.17 v3.18 v3.19 v3.20 edge; do
    for repo in main community; do
        echo "Syncing Alpine secdb: $branch/$repo"
        attempted=$((attempted + 1))
        target="$MIRRORS_ROOT/alpine/secdb/${branch}-${repo}.json"

        # --fail plus download-to-temp: an HTTP error or a truncated transfer
        # must never replace the previously mirrored file.
        if curl -fsSL "$ALPINE_SECDB/$branch/$repo.json" -o "$target.tmp"; then
            mv -- "$target.tmp" "$target"
        else
            echo "WARNING: failed to fetch Alpine secdb $branch/$repo; keeping previous copy" >&2
            rm -f -- "$target.tmp"
            failed=$((failed + 1))
        fi
    done
done

if (( failed == attempted )); then
    echo "ERROR: every Alpine secdb download failed; .last-sync not updated" >&2
    exit 1
fi

if (( failed > 0 )); then
    echo "WARNING: $failed of $attempted Alpine secdb downloads failed" >&2
fi

echo "Alpine secdb sync complete"
# Fixed locale and UTC so that `date -d` can always parse the stamp back.
LC_ALL=C date -u > "$MIRRORS_ROOT/alpine/.last-sync"

OSV Database Sync

#!/bin/bash
# sync-osv.sh
# Syncs OSV vulnerability database
#
# Environment (optional):
#   MIRRORS_ROOT  local mirror root

set -euo pipefail

MIRRORS_ROOT="${MIRRORS_ROOT:-/data/golden-corpus/mirrors}"
OSV_URL="https://osv-vulnerabilities.storage.googleapis.com"

osv_dir="$MIRRORS_ROOT/osv"
osv_zip="$osv_dir/all.zip"

mkdir -p "$osv_dir"

# Never leave a partial download behind, whichever way the script exits.
trap 'rm -f -- "$osv_zip.tmp"' EXIT

# Download full database
echo "Downloading OSV all.zip..."
# --fail: an HTTP error must abort here instead of an error page being
# installed as all.zip by the mv below.
curl -fsSL "$OSV_URL/all.zip" -o "$osv_zip.tmp"
mv -- "$osv_zip.tmp" "$osv_zip"

# Extract ecosystem-specific databases (best effort, failures are reported).
# NOTE(review): OSV also publishes per-ecosystem dumps at
# $OSV_URL/<ecosystem>/all.zip - confirm the top-level all.zip really
# contains <ecosystem>/ directories, otherwise nothing is extracted here.
for ecosystem in Debian Ubuntu Alpine; do
    mkdir -p "$osv_dir/$ecosystem"
    if ! unzip -o -q "$osv_zip" "$ecosystem/*" -d "$osv_dir/"; then
        echo "WARNING: could not extract $ecosystem/* from all.zip" >&2
    fi
done

echo "OSV sync complete"
# Fixed locale and UTC so that `date -d` can always parse the stamp back.
LC_ALL=C date -u > "$osv_dir/.last-sync"

Cron Configuration

# /etc/cron.d/golden-corpus-sync
# Fields: minute hour day-of-month month day-of-week user command.
# Every job runs as user "corpus" and appends stdout and stderr to its own
# log file under /var/log/corpus/.

# Mirror sync jobs
# NOTE(review): Debian and Ubuntu run every 4 hours here, while the frequency
# table lists Daily (archives) and Hourly (USN index) - confirm the cadence.
0 */4 * * * corpus /opt/golden-corpus/scripts/sync-debian-mirrors.sh >> /var/log/corpus/debian-sync.log 2>&1
0 */4 * * * corpus /opt/golden-corpus/scripts/sync-ubuntu-mirrors.sh >> /var/log/corpus/ubuntu-sync.log 2>&1
# Alpine secdb: once a day at 06:00.
0 6 * * *   corpus /opt/golden-corpus/scripts/sync-alpine-secdb.sh >> /var/log/corpus/alpine-sync.log 2>&1
# OSV: at minute 0 of every hour.
0 * * * *   corpus /opt/golden-corpus/scripts/sync-osv.sh >> /var/log/corpus/osv-sync.log 2>&1

# Health check
# Every 15 minutes.
*/15 * * * * corpus /opt/golden-corpus/scripts/check-mirror-health.sh >> /var/log/corpus/health.log 2>&1

Baseline Management

When to Update Baselines

Update the KPI baseline when:

  1. Algorithm improvements are merged (expected KPI improvement)
  2. New corpus pairs are added (may change baseline metrics)
  3. False positives/negatives are corrected in ground truth
  4. Major version upgrades of analysis tools

Baseline Update Procedure

1. Run Full Validation

# Run validation on the full corpus.
# The results file is named after the current timestamp (YYYYmmddHHMMSS);
# the substitution is quoted so the path is always passed as a single word.
# NOTE(review): the following steps read bench/results/latest.json - confirm
# how that path is tied to the file written here.
stella groundtruth validate run \
    --matcher semantic-diffing \
    --output "bench/results/$(date +%Y%m%d%H%M%S).json" \
    --verbose

2. Review Results

# Print the metrics for the run id "latest"
stella groundtruth validate metrics --run-id latest

# Regression check: latest results versus the current baseline
results_file=bench/results/latest.json
baseline_file=bench/baselines/current.json
stella groundtruth validate check --results "$results_file" --baseline "$baseline_file"

3. Update Baseline

Only if regression check passes or improvements are expected:

# Archive current baseline.
# The archive directory is created first so the copy cannot fail on a fresh
# checkout; a second archive on the same day overwrites the earlier one.
mkdir -p bench/baselines/archive
cp bench/baselines/current.json \
   "bench/baselines/archive/baseline-$(date +%Y%m%d).json"

# Update baseline from the latest results; --source records the commit the
# results were produced from.
stella groundtruth baseline update \
    --from-results bench/results/latest.json \
    --output bench/baselines/current.json \
    --description "Post algorithm-v2.3 update" \
    --source "$(git rev-parse HEAD)"

4. Commit and Document

# Stage everything under the baselines directory
git add bench/baselines/

# The commit message is read from stdin (-F -); the quoted 'EOF' delimiter
# keeps the text literal.
git commit -F - <<'EOF'
chore(bench): update golden corpus baseline

Reason: Algorithm v2.3 improvements
Previous baseline: baseline-20260115.json

Metrics:
- Precision: 0.95 -> 0.97 (+2pp)
- Recall: 0.92 -> 0.94 (+2pp)
- FN Rate: 0.08 -> 0.06 (-2pp)
- Determinism: 100%
- TTFRP p95: 150ms -> 140ms (-7%)
EOF

git push

Baseline Rollback

If a baseline update causes issues:

# Date stamp of the archived baseline to restore
rollback_to=20260115

# Copy the archived baseline over the current one
cp "bench/baselines/archive/baseline-${rollback_to}.json" bench/baselines/current.json

# Record the rollback in history
git add bench/baselines/current.json
git commit -m "revert(bench): rollback baseline to ${rollback_to}"
git push

Evidence Bundle Generation

Manual Bundle Export

# Export bundle for specific packages.
# The output name carries today's date (YYYYmmdd); the substitution is quoted
# so the path is always passed as a single word.
stella groundtruth bundle export \
    --packages openssl,curl,zlib \
    --distros debian,ubuntu \
    --output "evidence/security-bundle-$(date +%Y%m%d).tar.gz" \
    --sign-with-cosign \
    --include-debug \
    --include-kpis \
    --include-timestamps

Automated Bundle Generation

Schedule bundle generation for compliance reporting:

#!/bin/bash
# generate-compliance-bundles.sh
# Run monthly for audit evidence
#
# Environment (optional):
#   EVIDENCE_DIR  directory that receives the bundles and the manifest
#
# Writes one signed bundle per distro plus manifest-<YYYYmm>.json. Any failed
# export aborts the script (set -e) before the manifest is written.

set -euo pipefail

EVIDENCE_DIR="${EVIDENCE_DIR:-/data/golden-corpus/evidence}"
MONTH=$(date +%Y%m)

# Single source of truth for both the export loop and the manifest.
DISTROS=(debian ubuntu alpine)

# The export does not necessarily create its output directory.
mkdir -p "$EVIDENCE_DIR"

# Generate bundles for each distro
for distro in "${DISTROS[@]}"; do
    stella groundtruth bundle export \
        --distros "$distro" \
        --packages all \
        --output "$EVIDENCE_DIR/$distro-bundle-$MONTH.tar.gz" \
        --sign-with-cosign \
        --include-kpis \
        --include-timestamps
done

# Create manifest: {"month": "<YYYYmm>", "bundles": ["debian", "ubuntu", "alpine"]}
bundle_list=$(printf '"%s", ' "${DISTROS[@]}")
bundle_list=${bundle_list%, }   # drop the trailing separator
printf '{"month": "%s", "bundles": [%s]}\n' "$MONTH" "$bundle_list" \
    > "$EVIDENCE_DIR/manifest-$MONTH.json"

Bundle Verification

Always verify bundles after generation:

# Verify bundle integrity: the import options are collected in an array and
# passed to a single invocation.
verify_args=(
    --input evidence/security-bundle-20260122.tar.gz
    --verify
    --trusted-keys /etc/stellaops/trusted-keys.pub
    --trust-profile /etc/stellaops/trust-profiles/global.json
    --output verification-report.md
)
stella groundtruth bundle import "${verify_args[@]}"

Health Monitoring

Doctor Checks

Run Doctor checks regularly to validate corpus health:

# All corpus-related checks in one pass (the pattern is quoted so the shell
# does not try to glob it)
stella doctor --check "check.binaryanalysis.corpus.*"

# Individual checks, one invocation each
for check_id in \
    check.binaryanalysis.corpus.mirror.freshness \
    check.binaryanalysis.corpus.kpi.baseline \
    check.binaryanalysis.debuginfod.availability; do
    stella doctor --check "$check_id"
done

Health Check Script

#!/bin/bash
# check-mirror-health.sh
# Reports golden-corpus mirrors that are stale or unreachable

set -e -u -o pipefail

# Alert text collected during the run; starts empty
ALERTS=""
# Default maximum mirror age, in days
STALE_THRESHOLD_DAYS=7
# Mirror root; a non-empty value from the environment wins over the default
: "${MIRRORS_ROOT:=/data/golden-corpus/mirrors}"

#######################################
# Appends an alert to ALERTS when a mirror's last-sync stamp is missing,
# unreadable, or older than the allowed age.
# Globals:   ALERTS (appended to; each entry ends with a literal "\n" that
#            is expanded later when the report is printed with echo -e)
# Arguments: $1 - mirror name used in the alert text
#            $2 - path to the mirror's .last-sync stamp file
#            $3 - maximum allowed age in whole days
# Returns:   0; problems are reported through ALERTS, not the exit status
#######################################
check_mirror() {
    local mirror_name=$1
    local last_sync_file=$2
    local max_age=$3

    if [[ ! -f "$last_sync_file" ]]; then
        ALERTS+="CRITICAL: $mirror_name has never been synced\n"
        return
    fi

    # Declared separately from the assignments so that a failing command
    # substitution is not masked by the exit status of `local`.
    local last_sync last_sync_epoch now_epoch age_days

    last_sync=$(cat -- "$last_sync_file" 2>/dev/null) || last_sync=""

    # A blank stamp must be rejected explicitly: `date -d ""` succeeds and
    # means "today 00:00", which would make a broken stamp look fresh.
    if [[ -z "${last_sync//[[:space:]]/}" ]] \
        || ! last_sync_epoch=$(date -d "$last_sync" +%s 2>/dev/null); then
        ALERTS+="CRITICAL: $mirror_name has an unreadable last-sync timestamp\n"
        return
    fi

    now_epoch=$(date +%s)
    # Integer division: the age is counted in completed days.
    age_days=$(( (now_epoch - last_sync_epoch) / 86400 ))

    if (( age_days > max_age )); then
        ALERTS+="WARNING: $mirror_name is $age_days days old (threshold: $max_age)\n"
    fi
}

# Check each mirror (thresholds are in whole days)
check_mirror "Debian" "$MIRRORS_ROOT/debian/.last-sync" "$STALE_THRESHOLD_DAYS"
check_mirror "Ubuntu" "$MIRRORS_ROOT/ubuntu/.last-sync" "$STALE_THRESHOLD_DAYS"
check_mirror "Alpine" "$MIRRORS_ROOT/alpine/.last-sync" "$STALE_THRESHOLD_DAYS"
# OSV is synced hourly, but ages are measured in whole days, so 1 is the
# tightest threshold available: the alert fires once the stamp is 2+ days old.
check_mirror "OSV" "$MIRRORS_ROOT/osv/.last-sync" 1

# Check connectivity
for url in \
    "https://snapshot.debian.org" \
    "https://buildinfos.debian.net" \
    "https://ubuntu.com/security/notices.json" \
    "https://secdb.alpinelinux.org"; do

    # --connect-timeout only bounds the connection phase; --max-time bounds
    # the whole transfer so a stalled response cannot hang the health check.
    if ! curl -sSf --connect-timeout 5 --max-time 30 "$url" > /dev/null 2>&1; then
        ALERTS+="ERROR: Cannot reach $url\n"
    fi
done

# Report results
if [[ -n "$ALERTS" ]]; then
    # echo -e expands the literal "\n" separators stored in ALERTS
    echo -e "Golden Corpus Health Issues:\n$ALERTS"
    # Send alert (customize for your alerting system)
    # curl -X POST -d "$ALERTS" https://alerts.example.com/webhook
    exit 1
fi

echo "All mirrors healthy at $(date)"

Monitoring Metrics

Export these metrics to your monitoring system:

| Metric | Description | Alert Threshold |
|--------|-------------|-----------------|
| corpus.mirrors.age_seconds | Time since last mirror sync | > 7 days |
| corpus.pairs.total | Total number of security pairs | N/A (info) |
| corpus.validation.precision | Latest precision rate | < baseline - 0.01 |
| corpus.validation.recall | Latest recall rate | < baseline - 0.01 |
| corpus.validation.determinism | Deterministic replay rate | < 1.0 |
| corpus.bundle.count | Number of evidence bundles | N/A (info) |
| corpus.baseline.age_days | Days since baseline update | > 30 days |

Prometheus Metrics Example

# prometheus-corpus-metrics.yaml
# Alerting rules for the golden corpus: stale mirrors, precision regression
# and non-deterministic replay. No rule is defined here for recall or for
# baseline age.
# NOTE(review): the series below is named corpus_mirror_age_seconds, while the
# metrics table lists corpus.mirrors.age_seconds - confirm the exported name.
groups:
  - name: golden-corpus
    rules:
      # Fires when a mirror's age exceeds 604800 seconds.
      - alert: CorpusMirrorStale
        expr: corpus_mirror_age_seconds > 604800  # 7 days
        labels:
          severity: warning
        annotations:
          summary: "Corpus mirror {{ $labels.mirror }} is stale"

      # Fires when precision is more than 0.01 below the baseline precision.
      - alert: CorpusRegressionDetected
        expr: corpus_validation_precision < corpus_baseline_precision - 0.01
        labels:
          severity: critical
        annotations:
          summary: "Precision regression detected in golden corpus validation"

      # Fires when the determinism value is anything below 1.0.
      - alert: CorpusDeterminismFailure
        expr: corpus_validation_determinism < 1.0
        labels:
          severity: critical
        annotations:
          summary: "Non-deterministic replay detected"

Cleanup and Archival

Archive Old Results

#!/bin/bash
# archive-old-results.sh
# Archives results older than 90 days
#
# Moves old *.json results into the archive directory, then packs them into
# one results-<prefix>.tar.gz per six-character file-name prefix (YYYYmm for
# timestamp-named results). Files are added to an existing archive rather
# than replacing it, so repeated runs never drop results archived earlier.

set -euo pipefail

RESULTS_DIR="/data/golden-corpus/bench/results"
ARCHIVE_DIR="/data/golden-corpus/bench/archive"
AGE_DAYS=90

mkdir -p "$ARCHIVE_DIR"

find "$RESULTS_DIR" -name "*.json" -mtime +"$AGE_DAYS" -exec \
    mv -- {} "$ARCHIVE_DIR/" \;

# Compress archived results by month
cd "$ARCHIVE_DIR"   # set -e aborts here if the directory is not accessible

shopt -s nullglob   # an unmatched glob expands to nothing
pending=(*.json)

if (( ${#pending[@]} > 0 )); then
    mapfile -t months < <(printf '%s\n' "${pending[@]}" | cut -c1-6 | sort -u)

    for month in "${months[@]}"; do
        files=("$month"*.json)
        # Names shorter than the prefix length yield a prefix that matches
        # no file; skip them instead of creating an empty archive.
        (( ${#files[@]} > 0 )) || continue

        tarball="results-$month.tar.gz"
        work_tar="results-$month.tar.tmp"

        if [[ -e "$tarball" ]]; then
            # A gzip-compressed tar cannot be appended to in place:
            # unpack to a plain tar, append, and recompress below.
            gzip -dc -- "$tarball" > "$work_tar"
            tar -rf "$work_tar" "${files[@]}"
        else
            tar -cf "$work_tar" "${files[@]}"
        fi

        # Swap the new archive in only once it is complete, and delete the
        # sources only after that (set -e stops before rm on any failure).
        gzip -c -- "$work_tar" > "$tarball.tmp"
        mv -- "$tarball.tmp" "$tarball"
        rm -f -- "$work_tar" "${files[@]}"
    done
fi

Prune Old Baselines

Keep only the last N baselines:

#!/bin/bash
# prune-baselines.sh
# Keeps only the 10 most recent baseline archives
#
# "Most recent" is decided by file modification time, newest first.

set -euo pipefail

BASELINE_ARCHIVE="/data/golden-corpus/bench/baselines/archive"
KEEP_COUNT=10

# Stop if the directory is missing: the rm below must never run against
# whatever directory the script happened to be started from.
cd "$BASELINE_ARCHIVE" || {
    echo "ERROR: cannot enter $BASELINE_ARCHIVE" >&2
    exit 1
}

shopt -s nullglob   # no baselines -> empty list instead of a literal pattern
candidates=(baseline-*.json)

if (( ${#candidates[@]} > KEEP_COUNT )); then
    # ls -t orders by mtime, newest first; everything after the first
    # KEEP_COUNT names is removed. Names are passed newline-delimited.
    ls -t -- "${candidates[@]}" \
        | tail -n +"$((KEEP_COUNT + 1))" \
        | xargs -r -d '\n' rm -f --
fi