save progress

This commit is contained in:
StellaOps Bot
2026-01-06 09:42:02 +02:00
parent 94d68bee8b
commit 37e11918e0
443 changed files with 85863 additions and 897 deletions

377
docs/db/schemas/corpus.sql Normal file
View File

@@ -0,0 +1,377 @@
-- =============================================================================
-- CORPUS SCHEMA - Function Behavior Corpus for Binary Identification
-- Version: V3200_001
-- Sprint: SPRINT_20260105_001_002_BINDEX
-- =============================================================================
-- This schema stores fingerprints of known library functions (similar to
-- Ghidra's BSim/FunctionID) enabling identification of functions in stripped
-- binaries by matching against a large corpus of pre-indexed function behaviors.
-- =============================================================================
-- Namespace that holds every object defined in this file (tables, views,
-- functions). IF NOT EXISTS makes this statement a no-op when the schema is
-- already present.
CREATE SCHEMA IF NOT EXISTS corpus;
-- =============================================================================
-- HELPER FUNCTIONS
-- =============================================================================
-- Returns the tenant identifier of the current session.
--
-- Reads the session setting app.tenant_id. The second argument (missing_ok =
-- true) makes current_setting() return NULL for an unset variable instead of
-- raising its own error, so the check below can raise one consistent message.
--
-- Returns: TEXT - the non-empty value of app.tenant_id.
-- Raises:  'app.tenant_id session variable not set' when the setting is
--          NULL or the empty string.
--
-- The function is SECURITY DEFINER, so search_path is pinned: without it the
-- functions and operators used in the body would be resolved through the
-- caller's search_path, which lets a caller shadow them with objects in a
-- schema they control. pg_temp is listed last so temporary objects cannot
-- take precedence either.
CREATE OR REPLACE FUNCTION corpus.require_current_tenant()
RETURNS TEXT
LANGUAGE plpgsql
STABLE
SECURITY DEFINER
SET search_path = pg_catalog, pg_temp
AS $$
DECLARE
    v_tenant TEXT;
BEGIN
    v_tenant := current_setting('app.tenant_id', true);

    -- Treat an empty string the same as an unset variable.
    IF v_tenant IS NULL OR v_tenant = '' THEN
        RAISE EXCEPTION 'app.tenant_id session variable not set';
    END IF;

    RETURN v_tenant;
END;
$$;
-- =============================================================================
-- LIBRARIES
-- =============================================================================
-- Catalogue of libraries tracked in the corpus; a library name is unique
-- within a tenant. tenant_id is filled from corpus.require_current_tenant()
-- when the inserting statement omits it.
CREATE TABLE corpus.libraries (
    id           UUID        DEFAULT gen_random_uuid(),
    tenant_id    TEXT        NOT NULL DEFAULT corpus.require_current_tenant(),
    name         TEXT        NOT NULL,                -- glibc, openssl, zlib, curl, sqlite
    description  TEXT,
    homepage_url TEXT,
    source_repo  TEXT,                                -- git URL for source repository
    created_at   TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at   TIMESTAMPTZ NOT NULL DEFAULT now(),
    PRIMARY KEY (id),
    UNIQUE (tenant_id, name)
);

-- Tenant isolation: every command only sees and writes rows whose tenant_id
-- equals the session tenant.
ALTER TABLE corpus.libraries ENABLE ROW LEVEL SECURITY;

CREATE POLICY libraries_tenant_policy
    ON corpus.libraries
    FOR ALL
    USING (tenant_id = corpus.require_current_tenant());

-- Lookup indexes by tenant and by library name.
CREATE INDEX idx_libraries_tenant
    ON corpus.libraries (tenant_id);

CREATE INDEX idx_libraries_name
    ON corpus.libraries (name);
-- =============================================================================
-- LIBRARY VERSIONS
-- =============================================================================
-- One row per indexed version of a library. Rows are removed together with
-- their parent library (ON DELETE CASCADE), and a version string is unique
-- per (tenant, library).
CREATE TABLE corpus.library_versions (
    id                    UUID        DEFAULT gen_random_uuid(),
    tenant_id             TEXT        NOT NULL DEFAULT corpus.require_current_tenant(),
    library_id            UUID        NOT NULL
                                      REFERENCES corpus.libraries (id) ON DELETE CASCADE,
    version               TEXT        NOT NULL,       -- 2.31, 1.1.1n, 1.2.13
    release_date          DATE,
    is_security_release   BOOLEAN     DEFAULT false,
    source_archive_sha256 TEXT,                       -- Hash of source tarball for provenance
    indexed_at            TIMESTAMPTZ NOT NULL DEFAULT now(),
    PRIMARY KEY (id),
    UNIQUE (tenant_id, library_id, version)
);

-- Tenant isolation: rows are visible and writable only for the session tenant.
ALTER TABLE corpus.library_versions ENABLE ROW LEVEL SECURITY;

CREATE POLICY library_versions_tenant_policy
    ON corpus.library_versions
    FOR ALL
    USING (tenant_id = corpus.require_current_tenant());

-- Lookup indexes by parent library, version string and tenant.
CREATE INDEX idx_library_versions_library
    ON corpus.library_versions (library_id);

CREATE INDEX idx_library_versions_version
    ON corpus.library_versions (version);

CREATE INDEX idx_library_versions_tenant
    ON corpus.library_versions (tenant_id);
-- =============================================================================
-- BUILD VARIANTS
-- =============================================================================
-- One row per compiled build (architecture / ABI / compiler / optimization
-- level) of a library version. Rows are removed together with their parent
-- library version (ON DELETE CASCADE).
--
-- abi, compiler and optimization_level are nullable. A plain UNIQUE
-- constraint treats NULLs as distinct from each other, so two otherwise
-- identical variants with e.g. abi = NULL would both be accepted. NULLS NOT
-- DISTINCT makes NULL compare equal to NULL for this constraint, so the
-- combination is unique even when some of those columns are unknown.
-- Requires PostgreSQL 15 or later.
CREATE TABLE corpus.build_variants (
    id                 UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id          TEXT        NOT NULL DEFAULT corpus.require_current_tenant(),
    library_version_id UUID        NOT NULL
                                   REFERENCES corpus.library_versions (id) ON DELETE CASCADE,
    architecture       TEXT        NOT NULL,          -- x86_64, aarch64, armv7, i686
    abi                TEXT,                          -- gnu, musl, msvc
    compiler           TEXT,                          -- gcc, clang
    compiler_version   TEXT,
    optimization_level TEXT,                          -- O0, O2, O3, Os
    build_id           TEXT,                          -- ELF Build-ID if available
    binary_sha256      TEXT        NOT NULL,          -- Hash of binary for identity
    indexed_at         TIMESTAMPTZ NOT NULL DEFAULT now(),
    UNIQUE NULLS NOT DISTINCT
        (tenant_id, library_version_id, architecture, abi, compiler, optimization_level)
);

-- Lookup indexes by parent version, architecture and tenant.
CREATE INDEX idx_build_variants_version
    ON corpus.build_variants (library_version_id);

CREATE INDEX idx_build_variants_arch
    ON corpus.build_variants (architecture);

-- Partial index: only rows that actually carry a build_id are indexed.
CREATE INDEX idx_build_variants_build_id
    ON corpus.build_variants (build_id)
    WHERE build_id IS NOT NULL;

CREATE INDEX idx_build_variants_tenant
    ON corpus.build_variants (tenant_id);

-- Tenant isolation: rows are visible and writable only for the session tenant.
ALTER TABLE corpus.build_variants ENABLE ROW LEVEL SECURITY;

CREATE POLICY build_variants_tenant_policy
    ON corpus.build_variants
    FOR ALL
    USING (tenant_id = corpus.require_current_tenant());
-- =============================================================================
-- FUNCTIONS
-- =============================================================================
-- Functions extracted from a build variant. A function is identified within
-- its build variant by (name, address); rows are removed together with their
-- parent build variant (ON DELETE CASCADE).
CREATE TABLE corpus.functions (
    id               UUID        DEFAULT gen_random_uuid(),
    tenant_id        TEXT        NOT NULL DEFAULT corpus.require_current_tenant(),
    build_variant_id UUID        NOT NULL
                                 REFERENCES corpus.build_variants (id) ON DELETE CASCADE,
    name             TEXT        NOT NULL,            -- Function name (may be mangled for C++)
    demangled_name   TEXT,                            -- Demangled C++ name
    address          BIGINT      NOT NULL,            -- Function address in binary
    size_bytes       INTEGER     NOT NULL,            -- Function size
    is_exported      BOOLEAN     DEFAULT false,
    is_inline        BOOLEAN     DEFAULT false,
    source_file      TEXT,                            -- Source file if debug info available
    source_line      INTEGER,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    PRIMARY KEY (id),
    UNIQUE (tenant_id, build_variant_id, name, address)
);

-- Tenant isolation: rows are visible and writable only for the session tenant.
ALTER TABLE corpus.functions ENABLE ROW LEVEL SECURITY;

CREATE POLICY functions_tenant_policy
    ON corpus.functions
    FOR ALL
    USING (tenant_id = corpus.require_current_tenant());

-- Lookup indexes by parent build variant, name and tenant.
CREATE INDEX idx_functions_variant
    ON corpus.functions (build_variant_id);

CREATE INDEX idx_functions_name
    ON corpus.functions (name);

-- Partial indexes: only rows with a demangled name / exported functions.
CREATE INDEX idx_functions_demangled
    ON corpus.functions (demangled_name)
    WHERE demangled_name IS NOT NULL;

CREATE INDEX idx_functions_exported
    ON corpus.functions (is_exported)
    WHERE is_exported = true;

CREATE INDEX idx_functions_tenant
    ON corpus.functions (tenant_id);
-- =============================================================================
-- FINGERPRINTS
-- =============================================================================
-- Fingerprints computed for a function. Each function holds at most one
-- fingerprint per algorithm (see the UNIQUE constraint); rows are removed
-- together with their parent function (ON DELETE CASCADE).
CREATE TABLE corpus.fingerprints (
    id              UUID        DEFAULT gen_random_uuid(),
    tenant_id       TEXT        NOT NULL DEFAULT corpus.require_current_tenant(),
    function_id     UUID        NOT NULL
                                REFERENCES corpus.functions (id) ON DELETE CASCADE,
    -- Only the algorithm identifiers listed here are accepted.
    algorithm       TEXT        NOT NULL CHECK (algorithm IN (
                        'semantic_ksg',    -- Key-semantics graph (Phase 1)
                        'instruction_bb',  -- Instruction-level basic block hash
                        'cfg_wl',          -- Control flow graph Weisfeiler-Lehman hash
                        'api_calls',       -- API call sequence hash
                        'combined'         -- Multi-algorithm combined fingerprint
                    )),
    fingerprint     BYTEA       NOT NULL,             -- Variable length depending on algorithm
    -- Hex rendering of fingerprint, computed by the database and stored on disk.
    fingerprint_hex TEXT        GENERATED ALWAYS AS (encode(fingerprint, 'hex')) STORED,
    metadata        JSONB,                            -- Algorithm-specific metadata
    created_at      TIMESTAMPTZ NOT NULL DEFAULT now(),
    PRIMARY KEY (id),
    UNIQUE (tenant_id, function_id, algorithm)
);

-- Tenant isolation: rows are visible and writable only for the session tenant.
ALTER TABLE corpus.fingerprints ENABLE ROW LEVEL SECURITY;

CREATE POLICY fingerprints_tenant_policy
    ON corpus.fingerprints
    FOR ALL
    USING (tenant_id = corpus.require_current_tenant());

-- Indexes for fast fingerprint lookup: by owning function, by algorithm,
-- by (algorithm, hex value), and a hash index on the raw bytes.
CREATE INDEX idx_fingerprints_function
    ON corpus.fingerprints (function_id);

CREATE INDEX idx_fingerprints_algorithm
    ON corpus.fingerprints (algorithm);

CREATE INDEX idx_fingerprints_hex
    ON corpus.fingerprints (algorithm, fingerprint_hex);

CREATE INDEX idx_fingerprints_bytea
    ON corpus.fingerprints USING hash (fingerprint);

CREATE INDEX idx_fingerprints_tenant
    ON corpus.fingerprints (tenant_id);
-- =============================================================================
-- FUNCTION CLUSTERS
-- =============================================================================
-- Named groups of similar functions belonging to one library. A canonical
-- name is unique per (tenant, library); rows are removed together with their
-- parent library (ON DELETE CASCADE).
CREATE TABLE corpus.function_clusters (
    id             UUID        DEFAULT gen_random_uuid(),
    tenant_id      TEXT        NOT NULL DEFAULT corpus.require_current_tenant(),
    library_id     UUID        NOT NULL
                               REFERENCES corpus.libraries (id) ON DELETE CASCADE,
    canonical_name TEXT        NOT NULL,              -- e.g., "memcpy" across all versions
    description    TEXT,
    created_at     TIMESTAMPTZ NOT NULL DEFAULT now(),
    PRIMARY KEY (id),
    UNIQUE (tenant_id, library_id, canonical_name)
);

-- Tenant isolation: rows are visible and writable only for the session tenant.
ALTER TABLE corpus.function_clusters ENABLE ROW LEVEL SECURITY;

CREATE POLICY function_clusters_tenant_policy
    ON corpus.function_clusters
    FOR ALL
    USING (tenant_id = corpus.require_current_tenant());

-- Lookup indexes by parent library, canonical name and tenant.
CREATE INDEX idx_function_clusters_library
    ON corpus.function_clusters (library_id);

CREATE INDEX idx_function_clusters_name
    ON corpus.function_clusters (canonical_name);

CREATE INDEX idx_function_clusters_tenant
    ON corpus.function_clusters (tenant_id);
-- Junction table linking functions to the clusters they belong to. A function
-- appears at most once per cluster (composite primary key); a membership row
-- is removed when either the cluster or the function is deleted.
CREATE TABLE corpus.cluster_members (
    cluster_id             UUID         NOT NULL
                                        REFERENCES corpus.function_clusters (id) ON DELETE CASCADE,
    function_id            UUID         NOT NULL
                                        REFERENCES corpus.functions (id) ON DELETE CASCADE,
    tenant_id              TEXT         NOT NULL DEFAULT corpus.require_current_tenant(),
    similarity_to_centroid DECIMAL(5,4),              -- 5 significant digits, 4 after the decimal point
    PRIMARY KEY (cluster_id, function_id)
);

-- Tenant isolation: rows are visible and writable only for the session tenant.
ALTER TABLE corpus.cluster_members ENABLE ROW LEVEL SECURITY;

CREATE POLICY cluster_members_tenant_policy
    ON corpus.cluster_members
    FOR ALL
    USING (tenant_id = corpus.require_current_tenant());

-- The primary key already leads with cluster_id; this index serves lookups
-- that start from the function side.
CREATE INDEX idx_cluster_members_function
    ON corpus.cluster_members (function_id);

CREATE INDEX idx_cluster_members_tenant
    ON corpus.cluster_members (tenant_id);
-- =============================================================================
-- CVE ASSOCIATIONS
-- =============================================================================
-- Associates functions with CVE identifiers. Each (function, CVE) pair is
-- recorded at most once; rows are removed together with their parent function
-- (ON DELETE CASCADE).
CREATE TABLE corpus.function_cves (
    function_id    UUID         NOT NULL
                                REFERENCES corpus.functions (id) ON DELETE CASCADE,
    cve_id         TEXT         NOT NULL,
    tenant_id      TEXT         NOT NULL DEFAULT corpus.require_current_tenant(),
    -- State of the function with respect to the CVE; restricted to this list.
    affected_state TEXT         NOT NULL CHECK (affected_state IN (
                       'vulnerable', 'fixed', 'not_affected'
                   )),
    patch_commit   TEXT,                              -- Git commit that fixed the vulnerability
    -- Confidence is constrained to the closed range [0, 1].
    confidence     DECIMAL(3,2) NOT NULL CHECK (confidence >= 0 AND confidence <= 1),
    -- Optional; when present it must be one of the listed evidence kinds.
    evidence_type  TEXT         CHECK (evidence_type IN (
                       'changelog', 'commit', 'advisory', 'patch_header', 'manual'
                   )),
    created_at     TIMESTAMPTZ  NOT NULL DEFAULT now(),
    updated_at     TIMESTAMPTZ  NOT NULL DEFAULT now(),
    PRIMARY KEY (function_id, cve_id)
);

-- Tenant isolation: rows are visible and writable only for the session tenant.
ALTER TABLE corpus.function_cves ENABLE ROW LEVEL SECURITY;

CREATE POLICY function_cves_tenant_policy
    ON corpus.function_cves
    FOR ALL
    USING (tenant_id = corpus.require_current_tenant());

-- Lookup indexes by CVE identifier, affected state and tenant.
CREATE INDEX idx_function_cves_cve
    ON corpus.function_cves (cve_id);

CREATE INDEX idx_function_cves_state
    ON corpus.function_cves (affected_state);

CREATE INDEX idx_function_cves_tenant
    ON corpus.function_cves (tenant_id);
-- =============================================================================
-- INGESTION JOBS
-- =============================================================================
-- Bookkeeping for corpus ingestion runs against a library. New rows start in
-- status 'pending'; rows are removed together with their parent library
-- (ON DELETE CASCADE).
CREATE TABLE corpus.ingestion_jobs (
    id                     UUID        DEFAULT gen_random_uuid(),
    tenant_id              TEXT        NOT NULL DEFAULT corpus.require_current_tenant(),
    library_id             UUID        NOT NULL
                                       REFERENCES corpus.libraries (id) ON DELETE CASCADE,
    -- Kind of ingestion run; restricted to this list.
    job_type               TEXT        NOT NULL CHECK (job_type IN (
                               'full_ingest', 'incremental', 'cve_update'
                           )),
    -- Lifecycle state; restricted to this list.
    status                 TEXT        NOT NULL DEFAULT 'pending' CHECK (status IN (
                               'pending', 'running', 'completed', 'failed', 'cancelled'
                           )),
    started_at             TIMESTAMPTZ,
    completed_at           TIMESTAMPTZ,
    -- Nullable counters for the run.
    functions_indexed      INTEGER,
    fingerprints_generated INTEGER,
    clusters_created       INTEGER,
    errors                 JSONB,
    created_at             TIMESTAMPTZ NOT NULL DEFAULT now(),
    PRIMARY KEY (id)
);

-- Tenant isolation: rows are visible and writable only for the session tenant.
ALTER TABLE corpus.ingestion_jobs ENABLE ROW LEVEL SECURITY;

CREATE POLICY ingestion_jobs_tenant_policy
    ON corpus.ingestion_jobs
    FOR ALL
    USING (tenant_id = corpus.require_current_tenant());

-- Lookup indexes by parent library, status and tenant.
CREATE INDEX idx_ingestion_jobs_library
    ON corpus.ingestion_jobs (library_id);

CREATE INDEX idx_ingestion_jobs_status
    ON corpus.ingestion_jobs (status);

CREATE INDEX idx_ingestion_jobs_tenant
    ON corpus.ingestion_jobs (tenant_id);
-- =============================================================================
-- VIEWS
-- =============================================================================
-- Per-library roll-up: one row per library with the number of distinct
-- versions, functions and CVE identifiers reachable from it, plus the most
-- recent release_date among its versions. LEFT JOINs keep libraries that have
-- no versions (or no functions / CVEs) in the result with zero counts and a
-- NULL latest_version_date.
--
-- security_invoker = true: by default a view accesses its base tables with
-- the privileges of the view owner, and the owner's row-level security
-- policies apply, so the tenant policies on the corpus tables could be
-- bypassed by querying through the view. With security_invoker the base
-- tables are checked as the querying user, so the tenant policies apply.
-- Requires PostgreSQL 15 or later.
CREATE OR REPLACE VIEW corpus.library_summary
WITH (security_invoker = true) AS
SELECT
    l.id,
    l.tenant_id,
    l.name,
    l.description,
    -- DISTINCT is required: the join chain repeats each version/function once
    -- per descendant row.
    COUNT(DISTINCT lv.id)     AS version_count,
    COUNT(DISTINCT f.id)      AS function_count,
    COUNT(DISTINCT fc.cve_id) AS cve_count,
    MAX(lv.release_date)      AS latest_version_date,
    l.updated_at
FROM corpus.libraries AS l
LEFT JOIN corpus.library_versions AS lv
    ON lv.library_id = l.id
LEFT JOIN corpus.build_variants AS bv
    ON bv.library_version_id = lv.id
LEFT JOIN corpus.functions AS f
    ON f.build_variant_id = bv.id
LEFT JOIN corpus.function_cves AS fc
    ON fc.function_id = f.id
-- l.id is the primary key, so the other l.* columns are functionally
-- dependent on it and may be selected without being listed here.
GROUP BY l.id;
-- Flattened view of each function together with its build variant, library
-- version and library name. Inner joins are used throughout, so only
-- functions whose full parent chain is visible appear in the result.
--
-- security_invoker = true: by default a view accesses its base tables with
-- the privileges of the view owner, and the owner's row-level security
-- policies apply, so the tenant policies on the corpus tables could be
-- bypassed by querying through the view. With security_invoker the base
-- tables are checked as the querying user, so the tenant policies apply.
-- Requires PostgreSQL 15 or later.
CREATE OR REPLACE VIEW corpus.functions_with_context
WITH (security_invoker = true) AS
SELECT
    f.id   AS function_id,
    f.tenant_id,
    f.name AS function_name,
    f.demangled_name,
    f.address,
    f.size_bytes,
    f.is_exported,
    bv.architecture,
    bv.abi,
    bv.compiler,
    bv.optimization_level,
    lv.version,
    lv.release_date,
    l.name AS library_name
FROM corpus.functions AS f
INNER JOIN corpus.build_variants AS bv
    ON bv.id = f.build_variant_id
INNER JOIN corpus.library_versions AS lv
    ON lv.id = bv.library_version_id
INNER JOIN corpus.libraries AS l
    ON l.id = lv.library_id;
-- =============================================================================
-- STATISTICS FUNCTION
-- =============================================================================
-- Returns a single row of corpus-wide totals: the row count of each main
-- corpus table, plus last_updated, which is the newest created_at found in
-- corpus.functions (other tables are not consulted for it, and it is NULL
-- when corpus.functions has no rows).
--
-- The function is not declared SECURITY DEFINER, so the counts are taken with
-- the caller's privileges.
CREATE OR REPLACE FUNCTION corpus.get_statistics()
RETURNS TABLE (
    library_count         BIGINT,
    version_count         BIGINT,
    build_variant_count   BIGINT,
    function_count        BIGINT,
    fingerprint_count     BIGINT,
    cluster_count         BIGINT,
    cve_association_count BIGINT,
    last_updated          TIMESTAMPTZ
)
LANGUAGE sql
STABLE
AS $$
    -- Each output column is an independent scalar subquery; the aliases
    -- mirror the RETURNS TABLE column names in declaration order.
    SELECT
        (SELECT COUNT(*) FROM corpus.libraries)         AS library_count,
        (SELECT COUNT(*) FROM corpus.library_versions)  AS version_count,
        (SELECT COUNT(*) FROM corpus.build_variants)    AS build_variant_count,
        (SELECT COUNT(*) FROM corpus.functions)         AS function_count,
        (SELECT COUNT(*) FROM corpus.fingerprints)      AS fingerprint_count,
        (SELECT COUNT(*) FROM corpus.function_clusters) AS cluster_count,
        (SELECT COUNT(*) FROM corpus.function_cves)     AS cve_association_count,
        (SELECT MAX(fn.created_at) FROM corpus.functions AS fn) AS last_updated;
$$;
-- =============================================================================
-- COMMENTS
-- =============================================================================
-- Catalog descriptions attached to the schema and to each corpus table.
COMMENT ON SCHEMA corpus
    IS 'Function behavior corpus for binary identification';

COMMENT ON TABLE corpus.libraries
    IS 'Known libraries tracked in the corpus';
COMMENT ON TABLE corpus.library_versions
    IS 'Versions of libraries indexed in the corpus';
COMMENT ON TABLE corpus.build_variants
    IS 'Architecture/compiler variants of library versions';
COMMENT ON TABLE corpus.functions
    IS 'Functions extracted from build variants';
COMMENT ON TABLE corpus.fingerprints
    IS 'Fingerprints for function identification (multiple algorithms)';
COMMENT ON TABLE corpus.function_clusters
    IS 'Clusters of similar functions across versions';
COMMENT ON TABLE corpus.cluster_members
    IS 'Membership of functions in clusters';
COMMENT ON TABLE corpus.function_cves
    IS 'CVE associations for functions';
COMMENT ON TABLE corpus.ingestion_jobs
    IS 'Tracking for corpus ingestion jobs';