378 lines
16 KiB
PL/PgSQL
378 lines
16 KiB
PL/PgSQL
-- =============================================================================
|
|
-- CORPUS SCHEMA - Function Behavior Corpus for Binary Identification
|
|
-- Version: V3200_001
|
|
-- Sprint: SPRINT_20260105_001_002_BINDEX
|
|
-- =============================================================================
|
|
-- This schema stores fingerprints of known library functions (similar to
|
|
-- Ghidra's BSim/FunctionID) enabling identification of functions in stripped
|
|
-- binaries by matching against a large corpus of pre-indexed function behaviors.
|
|
-- =============================================================================
|
|
|
|
CREATE SCHEMA IF NOT EXISTS corpus;
|
|
|
|
-- =============================================================================
|
|
-- HELPER FUNCTIONS
|
|
-- =============================================================================
|
|
|
|
-- Require tenant_id for RLS
|
|
CREATE OR REPLACE FUNCTION corpus.require_current_tenant()
|
|
RETURNS TEXT LANGUAGE plpgsql STABLE SECURITY DEFINER AS $$
|
|
DECLARE v_tenant TEXT;
|
|
BEGIN
|
|
v_tenant := current_setting('app.tenant_id', true);
|
|
IF v_tenant IS NULL OR v_tenant = '' THEN
|
|
RAISE EXCEPTION 'app.tenant_id session variable not set';
|
|
END IF;
|
|
RETURN v_tenant;
|
|
END;
|
|
$$;
|
|
|
|
-- =============================================================================
|
|
-- LIBRARIES
|
|
-- =============================================================================
|
|
|
|
-- Known libraries tracked in the corpus
|
|
CREATE TABLE corpus.libraries (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(),
|
|
name TEXT NOT NULL, -- glibc, openssl, zlib, curl, sqlite
|
|
description TEXT,
|
|
homepage_url TEXT,
|
|
source_repo TEXT, -- git URL for source repository
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
UNIQUE (tenant_id, name)
|
|
);
|
|
|
|
CREATE INDEX idx_libraries_tenant ON corpus.libraries(tenant_id);
|
|
CREATE INDEX idx_libraries_name ON corpus.libraries(name);
|
|
|
|
-- Enable RLS
|
|
ALTER TABLE corpus.libraries ENABLE ROW LEVEL SECURITY;
|
|
|
|
CREATE POLICY libraries_tenant_policy ON corpus.libraries
|
|
FOR ALL
|
|
USING (tenant_id = corpus.require_current_tenant());
|
|
|
|
-- =============================================================================
|
|
-- LIBRARY VERSIONS
|
|
-- =============================================================================
|
|
|
|
-- Library versions indexed in the corpus
|
|
CREATE TABLE corpus.library_versions (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(),
|
|
library_id UUID NOT NULL REFERENCES corpus.libraries(id) ON DELETE CASCADE,
|
|
version TEXT NOT NULL, -- 2.31, 1.1.1n, 1.2.13
|
|
release_date DATE,
|
|
is_security_release BOOLEAN DEFAULT false,
|
|
source_archive_sha256 TEXT, -- Hash of source tarball for provenance
|
|
indexed_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
UNIQUE (tenant_id, library_id, version)
|
|
);
|
|
|
|
CREATE INDEX idx_library_versions_library ON corpus.library_versions(library_id);
|
|
CREATE INDEX idx_library_versions_version ON corpus.library_versions(version);
|
|
CREATE INDEX idx_library_versions_tenant ON corpus.library_versions(tenant_id);
|
|
|
|
ALTER TABLE corpus.library_versions ENABLE ROW LEVEL SECURITY;
|
|
|
|
CREATE POLICY library_versions_tenant_policy ON corpus.library_versions
|
|
FOR ALL
|
|
USING (tenant_id = corpus.require_current_tenant());
|
|
|
|
-- =============================================================================
|
|
-- BUILD VARIANTS
|
|
-- =============================================================================
|
|
|
|
-- Architecture/compiler variants of library versions
|
|
CREATE TABLE corpus.build_variants (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(),
|
|
library_version_id UUID NOT NULL REFERENCES corpus.library_versions(id) ON DELETE CASCADE,
|
|
architecture TEXT NOT NULL, -- x86_64, aarch64, armv7, i686
|
|
abi TEXT, -- gnu, musl, msvc
|
|
compiler TEXT, -- gcc, clang
|
|
compiler_version TEXT,
|
|
optimization_level TEXT, -- O0, O2, O3, Os
|
|
build_id TEXT, -- ELF Build-ID if available
|
|
binary_sha256 TEXT NOT NULL, -- Hash of binary for identity
|
|
indexed_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
UNIQUE (tenant_id, library_version_id, architecture, abi, compiler, optimization_level)
|
|
);
|
|
|
|
CREATE INDEX idx_build_variants_version ON corpus.build_variants(library_version_id);
|
|
CREATE INDEX idx_build_variants_arch ON corpus.build_variants(architecture);
|
|
CREATE INDEX idx_build_variants_build_id ON corpus.build_variants(build_id) WHERE build_id IS NOT NULL;
|
|
CREATE INDEX idx_build_variants_tenant ON corpus.build_variants(tenant_id);
|
|
|
|
ALTER TABLE corpus.build_variants ENABLE ROW LEVEL SECURITY;
|
|
|
|
CREATE POLICY build_variants_tenant_policy ON corpus.build_variants
|
|
FOR ALL
|
|
USING (tenant_id = corpus.require_current_tenant());
|
|
|
|
-- =============================================================================
|
|
-- FUNCTIONS
|
|
-- =============================================================================
|
|
|
|
-- Functions in the corpus
|
|
CREATE TABLE corpus.functions (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(),
|
|
build_variant_id UUID NOT NULL REFERENCES corpus.build_variants(id) ON DELETE CASCADE,
|
|
name TEXT NOT NULL, -- Function name (may be mangled for C++)
|
|
demangled_name TEXT, -- Demangled C++ name
|
|
address BIGINT NOT NULL, -- Function address in binary
|
|
size_bytes INTEGER NOT NULL, -- Function size
|
|
is_exported BOOLEAN DEFAULT false,
|
|
is_inline BOOLEAN DEFAULT false,
|
|
source_file TEXT, -- Source file if debug info available
|
|
source_line INTEGER,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
UNIQUE (tenant_id, build_variant_id, name, address)
|
|
);
|
|
|
|
CREATE INDEX idx_functions_variant ON corpus.functions(build_variant_id);
|
|
CREATE INDEX idx_functions_name ON corpus.functions(name);
|
|
CREATE INDEX idx_functions_demangled ON corpus.functions(demangled_name) WHERE demangled_name IS NOT NULL;
|
|
CREATE INDEX idx_functions_exported ON corpus.functions(is_exported) WHERE is_exported = true;
|
|
CREATE INDEX idx_functions_tenant ON corpus.functions(tenant_id);
|
|
|
|
ALTER TABLE corpus.functions ENABLE ROW LEVEL SECURITY;
|
|
|
|
CREATE POLICY functions_tenant_policy ON corpus.functions
|
|
FOR ALL
|
|
USING (tenant_id = corpus.require_current_tenant());
|
|
|
|
-- =============================================================================
|
|
-- FINGERPRINTS
|
|
-- =============================================================================
|
|
|
|
-- Function fingerprints (multiple algorithms per function)
|
|
CREATE TABLE corpus.fingerprints (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(),
|
|
function_id UUID NOT NULL REFERENCES corpus.functions(id) ON DELETE CASCADE,
|
|
algorithm TEXT NOT NULL CHECK (algorithm IN (
|
|
'semantic_ksg', -- Key-semantics graph (Phase 1)
|
|
'instruction_bb', -- Instruction-level basic block hash
|
|
'cfg_wl', -- Control flow graph Weisfeiler-Lehman hash
|
|
'api_calls', -- API call sequence hash
|
|
'combined' -- Multi-algorithm combined fingerprint
|
|
)),
|
|
fingerprint BYTEA NOT NULL, -- Variable length depending on algorithm
|
|
fingerprint_hex TEXT GENERATED ALWAYS AS (encode(fingerprint, 'hex')) STORED,
|
|
metadata JSONB, -- Algorithm-specific metadata
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
UNIQUE (tenant_id, function_id, algorithm)
|
|
);
|
|
|
|
-- Indexes for fast fingerprint lookup
|
|
CREATE INDEX idx_fingerprints_function ON corpus.fingerprints(function_id);
|
|
CREATE INDEX idx_fingerprints_algorithm ON corpus.fingerprints(algorithm);
|
|
CREATE INDEX idx_fingerprints_hex ON corpus.fingerprints(algorithm, fingerprint_hex);
|
|
CREATE INDEX idx_fingerprints_bytea ON corpus.fingerprints USING hash (fingerprint);
|
|
CREATE INDEX idx_fingerprints_tenant ON corpus.fingerprints(tenant_id);
|
|
|
|
ALTER TABLE corpus.fingerprints ENABLE ROW LEVEL SECURITY;
|
|
|
|
CREATE POLICY fingerprints_tenant_policy ON corpus.fingerprints
|
|
FOR ALL
|
|
USING (tenant_id = corpus.require_current_tenant());
|
|
|
|
-- =============================================================================
|
|
-- FUNCTION CLUSTERS
|
|
-- =============================================================================
|
|
|
|
-- Clusters of similar functions across versions
|
|
CREATE TABLE corpus.function_clusters (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(),
|
|
library_id UUID NOT NULL REFERENCES corpus.libraries(id) ON DELETE CASCADE,
|
|
canonical_name TEXT NOT NULL, -- e.g., "memcpy" across all versions
|
|
description TEXT,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
UNIQUE (tenant_id, library_id, canonical_name)
|
|
);
|
|
|
|
CREATE INDEX idx_function_clusters_library ON corpus.function_clusters(library_id);
|
|
CREATE INDEX idx_function_clusters_name ON corpus.function_clusters(canonical_name);
|
|
CREATE INDEX idx_function_clusters_tenant ON corpus.function_clusters(tenant_id);
|
|
|
|
ALTER TABLE corpus.function_clusters ENABLE ROW LEVEL SECURITY;
|
|
|
|
CREATE POLICY function_clusters_tenant_policy ON corpus.function_clusters
|
|
FOR ALL
|
|
USING (tenant_id = corpus.require_current_tenant());
|
|
|
|
-- Cluster membership
|
|
CREATE TABLE corpus.cluster_members (
|
|
cluster_id UUID NOT NULL REFERENCES corpus.function_clusters(id) ON DELETE CASCADE,
|
|
function_id UUID NOT NULL REFERENCES corpus.functions(id) ON DELETE CASCADE,
|
|
tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(),
|
|
similarity_to_centroid DECIMAL(5,4),
|
|
PRIMARY KEY (cluster_id, function_id)
|
|
);
|
|
|
|
CREATE INDEX idx_cluster_members_function ON corpus.cluster_members(function_id);
|
|
CREATE INDEX idx_cluster_members_tenant ON corpus.cluster_members(tenant_id);
|
|
|
|
ALTER TABLE corpus.cluster_members ENABLE ROW LEVEL SECURITY;
|
|
|
|
CREATE POLICY cluster_members_tenant_policy ON corpus.cluster_members
|
|
FOR ALL
|
|
USING (tenant_id = corpus.require_current_tenant());
|
|
|
|
-- =============================================================================
|
|
-- CVE ASSOCIATIONS
|
|
-- =============================================================================
|
|
|
|
-- CVE associations for functions
|
|
CREATE TABLE corpus.function_cves (
|
|
function_id UUID NOT NULL REFERENCES corpus.functions(id) ON DELETE CASCADE,
|
|
cve_id TEXT NOT NULL,
|
|
tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(),
|
|
affected_state TEXT NOT NULL CHECK (affected_state IN (
|
|
'vulnerable', 'fixed', 'not_affected'
|
|
)),
|
|
patch_commit TEXT, -- Git commit that fixed the vulnerability
|
|
confidence DECIMAL(3,2) NOT NULL CHECK (confidence >= 0 AND confidence <= 1),
|
|
evidence_type TEXT CHECK (evidence_type IN (
|
|
'changelog', 'commit', 'advisory', 'patch_header', 'manual'
|
|
)),
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
PRIMARY KEY (function_id, cve_id)
|
|
);
|
|
|
|
CREATE INDEX idx_function_cves_cve ON corpus.function_cves(cve_id);
|
|
CREATE INDEX idx_function_cves_state ON corpus.function_cves(affected_state);
|
|
CREATE INDEX idx_function_cves_tenant ON corpus.function_cves(tenant_id);
|
|
|
|
ALTER TABLE corpus.function_cves ENABLE ROW LEVEL SECURITY;
|
|
|
|
CREATE POLICY function_cves_tenant_policy ON corpus.function_cves
|
|
FOR ALL
|
|
USING (tenant_id = corpus.require_current_tenant());
|
|
|
|
-- =============================================================================
|
|
-- INGESTION JOBS
|
|
-- =============================================================================
|
|
|
|
-- Ingestion job tracking
|
|
CREATE TABLE corpus.ingestion_jobs (
|
|
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
|
tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(),
|
|
library_id UUID NOT NULL REFERENCES corpus.libraries(id) ON DELETE CASCADE,
|
|
job_type TEXT NOT NULL CHECK (job_type IN (
|
|
'full_ingest', 'incremental', 'cve_update'
|
|
)),
|
|
status TEXT NOT NULL DEFAULT 'pending' CHECK (status IN (
|
|
'pending', 'running', 'completed', 'failed', 'cancelled'
|
|
)),
|
|
started_at TIMESTAMPTZ,
|
|
completed_at TIMESTAMPTZ,
|
|
functions_indexed INTEGER,
|
|
fingerprints_generated INTEGER,
|
|
clusters_created INTEGER,
|
|
errors JSONB,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
);
|
|
|
|
CREATE INDEX idx_ingestion_jobs_library ON corpus.ingestion_jobs(library_id);
|
|
CREATE INDEX idx_ingestion_jobs_status ON corpus.ingestion_jobs(status);
|
|
CREATE INDEX idx_ingestion_jobs_tenant ON corpus.ingestion_jobs(tenant_id);
|
|
|
|
ALTER TABLE corpus.ingestion_jobs ENABLE ROW LEVEL SECURITY;
|
|
|
|
CREATE POLICY ingestion_jobs_tenant_policy ON corpus.ingestion_jobs
|
|
FOR ALL
|
|
USING (tenant_id = corpus.require_current_tenant());
|
|
|
|
-- =============================================================================
|
|
-- VIEWS
|
|
-- =============================================================================
|
|
|
|
-- Library summary view
|
|
CREATE OR REPLACE VIEW corpus.library_summary AS
|
|
SELECT
|
|
l.id,
|
|
l.tenant_id,
|
|
l.name,
|
|
l.description,
|
|
COUNT(DISTINCT lv.id) AS version_count,
|
|
COUNT(DISTINCT f.id) AS function_count,
|
|
COUNT(DISTINCT fc.cve_id) AS cve_count,
|
|
MAX(lv.release_date) AS latest_version_date,
|
|
l.updated_at
|
|
FROM corpus.libraries l
|
|
LEFT JOIN corpus.library_versions lv ON lv.library_id = l.id
|
|
LEFT JOIN corpus.build_variants bv ON bv.library_version_id = lv.id
|
|
LEFT JOIN corpus.functions f ON f.build_variant_id = bv.id
|
|
LEFT JOIN corpus.function_cves fc ON fc.function_id = f.id
|
|
GROUP BY l.id;
|
|
|
|
-- Function with full context view
|
|
CREATE OR REPLACE VIEW corpus.functions_with_context AS
|
|
SELECT
|
|
f.id AS function_id,
|
|
f.tenant_id,
|
|
f.name AS function_name,
|
|
f.demangled_name,
|
|
f.address,
|
|
f.size_bytes,
|
|
f.is_exported,
|
|
bv.architecture,
|
|
bv.abi,
|
|
bv.compiler,
|
|
bv.optimization_level,
|
|
lv.version,
|
|
lv.release_date,
|
|
l.name AS library_name
|
|
FROM corpus.functions f
|
|
JOIN corpus.build_variants bv ON bv.id = f.build_variant_id
|
|
JOIN corpus.library_versions lv ON lv.id = bv.library_version_id
|
|
JOIN corpus.libraries l ON l.id = lv.library_id;
|
|
|
|
-- =============================================================================
|
|
-- STATISTICS FUNCTION
|
|
-- =============================================================================
|
|
|
|
CREATE OR REPLACE FUNCTION corpus.get_statistics()
|
|
RETURNS TABLE (
|
|
library_count BIGINT,
|
|
version_count BIGINT,
|
|
build_variant_count BIGINT,
|
|
function_count BIGINT,
|
|
fingerprint_count BIGINT,
|
|
cluster_count BIGINT,
|
|
cve_association_count BIGINT,
|
|
last_updated TIMESTAMPTZ
|
|
) LANGUAGE sql STABLE AS $$
|
|
SELECT
|
|
(SELECT COUNT(*) FROM corpus.libraries),
|
|
(SELECT COUNT(*) FROM corpus.library_versions),
|
|
(SELECT COUNT(*) FROM corpus.build_variants),
|
|
(SELECT COUNT(*) FROM corpus.functions),
|
|
(SELECT COUNT(*) FROM corpus.fingerprints),
|
|
(SELECT COUNT(*) FROM corpus.function_clusters),
|
|
(SELECT COUNT(*) FROM corpus.function_cves),
|
|
(SELECT MAX(created_at) FROM corpus.functions);
|
|
$$;
|
|
|
|
-- =============================================================================
|
|
-- COMMENTS
|
|
-- =============================================================================
|
|
|
|
COMMENT ON SCHEMA corpus IS 'Function behavior corpus for binary identification';
|
|
COMMENT ON TABLE corpus.libraries IS 'Known libraries tracked in the corpus';
|
|
COMMENT ON TABLE corpus.library_versions IS 'Versions of libraries indexed in the corpus';
|
|
COMMENT ON TABLE corpus.build_variants IS 'Architecture/compiler variants of library versions';
|
|
COMMENT ON TABLE corpus.functions IS 'Functions extracted from build variants';
|
|
COMMENT ON TABLE corpus.fingerprints IS 'Fingerprints for function identification (multiple algorithms)';
|
|
COMMENT ON TABLE corpus.function_clusters IS 'Clusters of similar functions across versions';
|
|
COMMENT ON TABLE corpus.cluster_members IS 'Membership of functions in clusters';
|
|
COMMENT ON TABLE corpus.function_cves IS 'CVE associations for functions';
|
|
COMMENT ON TABLE corpus.ingestion_jobs IS 'Tracking for corpus ingestion jobs';
|