-- ============================================================================= -- CORPUS SCHEMA - Function Behavior Corpus for Binary Identification -- Version: V3200_001 -- Sprint: SPRINT_20260105_001_002_BINDEX -- ============================================================================= -- This schema stores fingerprints of known library functions (similar to -- Ghidra's BSim/FunctionID) enabling identification of functions in stripped -- binaries by matching against a large corpus of pre-indexed function behaviors. -- ============================================================================= CREATE SCHEMA IF NOT EXISTS corpus; -- ============================================================================= -- HELPER FUNCTIONS -- ============================================================================= -- Require tenant_id for RLS CREATE OR REPLACE FUNCTION corpus.require_current_tenant() RETURNS TEXT LANGUAGE plpgsql STABLE SECURITY DEFINER AS $$ DECLARE v_tenant TEXT; BEGIN v_tenant := current_setting('app.tenant_id', true); IF v_tenant IS NULL OR v_tenant = '' THEN RAISE EXCEPTION 'app.tenant_id session variable not set'; END IF; RETURN v_tenant; END; $$; -- ============================================================================= -- LIBRARIES -- ============================================================================= -- Known libraries tracked in the corpus CREATE TABLE corpus.libraries ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(), name TEXT NOT NULL, -- glibc, openssl, zlib, curl, sqlite description TEXT, homepage_url TEXT, source_repo TEXT, -- git URL for source repository created_at TIMESTAMPTZ NOT NULL DEFAULT now(), updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), UNIQUE (tenant_id, name) ); CREATE INDEX idx_libraries_tenant ON corpus.libraries(tenant_id); CREATE INDEX idx_libraries_name ON corpus.libraries(name); -- Enable RLS ALTER TABLE corpus.libraries ENABLE ROW LEVEL SECURITY; CREATE POLICY libraries_tenant_policy ON corpus.libraries FOR ALL USING (tenant_id = corpus.require_current_tenant()); -- ============================================================================= -- LIBRARY VERSIONS -- ============================================================================= -- Library versions indexed in the corpus CREATE TABLE corpus.library_versions ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(), library_id UUID NOT NULL REFERENCES corpus.libraries(id) ON DELETE CASCADE, version TEXT NOT NULL, -- 2.31, 1.1.1n, 1.2.13 release_date DATE, is_security_release BOOLEAN DEFAULT false, source_archive_sha256 TEXT, -- Hash of source tarball for provenance indexed_at TIMESTAMPTZ NOT NULL DEFAULT now(), UNIQUE (tenant_id, library_id, version) ); CREATE INDEX idx_library_versions_library ON corpus.library_versions(library_id); CREATE INDEX idx_library_versions_version ON corpus.library_versions(version); CREATE INDEX idx_library_versions_tenant ON corpus.library_versions(tenant_id); ALTER TABLE corpus.library_versions ENABLE ROW LEVEL SECURITY; CREATE POLICY library_versions_tenant_policy ON corpus.library_versions FOR ALL USING (tenant_id = corpus.require_current_tenant()); -- ============================================================================= -- BUILD VARIANTS -- ============================================================================= -- Architecture/compiler variants of library versions CREATE TABLE corpus.build_variants ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(), library_version_id UUID NOT NULL REFERENCES corpus.library_versions(id) ON DELETE CASCADE, architecture TEXT NOT NULL, -- x86_64, aarch64, armv7, i686 abi TEXT, -- gnu, musl, msvc compiler TEXT, -- gcc, clang compiler_version TEXT, optimization_level TEXT, -- O0, O2, O3, Os build_id TEXT, -- ELF Build-ID if available binary_sha256 TEXT NOT NULL, -- Hash of binary for identity indexed_at TIMESTAMPTZ NOT NULL DEFAULT now(), UNIQUE (tenant_id, library_version_id, architecture, abi, compiler, optimization_level) ); CREATE INDEX idx_build_variants_version ON corpus.build_variants(library_version_id); CREATE INDEX idx_build_variants_arch ON corpus.build_variants(architecture); CREATE INDEX idx_build_variants_build_id ON corpus.build_variants(build_id) WHERE build_id IS NOT NULL; CREATE INDEX idx_build_variants_tenant ON corpus.build_variants(tenant_id); ALTER TABLE corpus.build_variants ENABLE ROW LEVEL SECURITY; CREATE POLICY build_variants_tenant_policy ON corpus.build_variants FOR ALL USING (tenant_id = corpus.require_current_tenant()); -- ============================================================================= -- FUNCTIONS -- ============================================================================= -- Functions in the corpus CREATE TABLE corpus.functions ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(), build_variant_id UUID NOT NULL REFERENCES corpus.build_variants(id) ON DELETE CASCADE, name TEXT NOT NULL, -- Function name (may be mangled for C++) demangled_name TEXT, -- Demangled C++ name address BIGINT NOT NULL, -- Function address in binary size_bytes INTEGER NOT NULL, -- Function size is_exported BOOLEAN DEFAULT false, is_inline BOOLEAN DEFAULT false, source_file TEXT, -- Source file if debug info available source_line INTEGER, created_at TIMESTAMPTZ NOT NULL DEFAULT now(), UNIQUE (tenant_id, build_variant_id, name, address) ); CREATE INDEX idx_functions_variant ON corpus.functions(build_variant_id); CREATE INDEX idx_functions_name ON corpus.functions(name); CREATE INDEX idx_functions_demangled ON corpus.functions(demangled_name) WHERE demangled_name IS NOT NULL; CREATE INDEX idx_functions_exported ON corpus.functions(is_exported) WHERE is_exported = true; CREATE INDEX idx_functions_tenant ON corpus.functions(tenant_id); ALTER TABLE corpus.functions ENABLE ROW LEVEL SECURITY; CREATE POLICY functions_tenant_policy ON corpus.functions FOR ALL USING (tenant_id = corpus.require_current_tenant()); -- ============================================================================= -- FINGERPRINTS -- ============================================================================= -- Function fingerprints (multiple algorithms per function) CREATE TABLE corpus.fingerprints ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(), function_id UUID NOT NULL REFERENCES corpus.functions(id) ON DELETE CASCADE, algorithm TEXT NOT NULL CHECK (algorithm IN ( 'semantic_ksg', -- Key-semantics graph (Phase 1) 'instruction_bb', -- Instruction-level basic block hash 'cfg_wl', -- Control flow graph Weisfeiler-Lehman hash 'api_calls', -- API call sequence hash 'combined' -- Multi-algorithm combined fingerprint )), fingerprint BYTEA NOT NULL, -- Variable length depending on algorithm fingerprint_hex TEXT GENERATED ALWAYS AS (encode(fingerprint, 'hex')) STORED, metadata JSONB, -- Algorithm-specific metadata created_at TIMESTAMPTZ NOT NULL DEFAULT now(), UNIQUE (tenant_id, function_id, algorithm) ); -- Indexes for fast fingerprint lookup CREATE INDEX idx_fingerprints_function ON corpus.fingerprints(function_id); CREATE INDEX idx_fingerprints_algorithm ON corpus.fingerprints(algorithm); CREATE INDEX idx_fingerprints_hex ON corpus.fingerprints(algorithm, fingerprint_hex); CREATE INDEX idx_fingerprints_bytea ON corpus.fingerprints USING hash (fingerprint); CREATE INDEX idx_fingerprints_tenant ON corpus.fingerprints(tenant_id); ALTER TABLE corpus.fingerprints ENABLE ROW LEVEL SECURITY; CREATE POLICY fingerprints_tenant_policy ON corpus.fingerprints FOR ALL USING (tenant_id = corpus.require_current_tenant()); -- ============================================================================= -- FUNCTION CLUSTERS -- ============================================================================= -- Clusters of similar functions across versions CREATE TABLE corpus.function_clusters ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(), library_id UUID NOT NULL REFERENCES corpus.libraries(id) ON DELETE CASCADE, canonical_name TEXT NOT NULL, -- e.g., "memcpy" across all versions description TEXT, created_at TIMESTAMPTZ NOT NULL DEFAULT now(), UNIQUE (tenant_id, library_id, canonical_name) ); CREATE INDEX idx_function_clusters_library ON corpus.function_clusters(library_id); CREATE INDEX idx_function_clusters_name ON corpus.function_clusters(canonical_name); CREATE INDEX idx_function_clusters_tenant ON corpus.function_clusters(tenant_id); ALTER TABLE corpus.function_clusters ENABLE ROW LEVEL SECURITY; CREATE POLICY function_clusters_tenant_policy ON corpus.function_clusters FOR ALL USING (tenant_id = corpus.require_current_tenant()); -- Cluster membership CREATE TABLE corpus.cluster_members ( cluster_id UUID NOT NULL REFERENCES corpus.function_clusters(id) ON DELETE CASCADE, function_id UUID NOT NULL REFERENCES corpus.functions(id) ON DELETE CASCADE, tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(), similarity_to_centroid DECIMAL(5,4), PRIMARY KEY (cluster_id, function_id) ); CREATE INDEX idx_cluster_members_function ON corpus.cluster_members(function_id); CREATE INDEX idx_cluster_members_tenant ON corpus.cluster_members(tenant_id); ALTER TABLE corpus.cluster_members ENABLE ROW LEVEL SECURITY; CREATE POLICY cluster_members_tenant_policy ON corpus.cluster_members FOR ALL USING (tenant_id = corpus.require_current_tenant()); -- ============================================================================= -- CVE ASSOCIATIONS -- ============================================================================= -- CVE associations for functions CREATE TABLE corpus.function_cves ( function_id UUID NOT NULL REFERENCES corpus.functions(id) ON DELETE CASCADE, cve_id TEXT NOT NULL, tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(), affected_state TEXT NOT NULL CHECK (affected_state IN ( 'vulnerable', 'fixed', 'not_affected' )), patch_commit TEXT, -- Git commit that fixed the vulnerability confidence DECIMAL(3,2) NOT NULL CHECK (confidence >= 0 AND confidence <= 1), evidence_type TEXT CHECK (evidence_type IN ( 'changelog', 'commit', 'advisory', 'patch_header', 'manual' )), created_at TIMESTAMPTZ NOT NULL DEFAULT now(), updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), PRIMARY KEY (function_id, cve_id) ); CREATE INDEX idx_function_cves_cve ON corpus.function_cves(cve_id); CREATE INDEX idx_function_cves_state ON corpus.function_cves(affected_state); CREATE INDEX idx_function_cves_tenant ON corpus.function_cves(tenant_id); ALTER TABLE corpus.function_cves ENABLE ROW LEVEL SECURITY; CREATE POLICY function_cves_tenant_policy ON corpus.function_cves FOR ALL USING (tenant_id = corpus.require_current_tenant()); -- ============================================================================= -- INGESTION JOBS -- ============================================================================= -- Ingestion job tracking CREATE TABLE corpus.ingestion_jobs ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), tenant_id TEXT NOT NULL DEFAULT corpus.require_current_tenant(), library_id UUID NOT NULL REFERENCES corpus.libraries(id) ON DELETE CASCADE, job_type TEXT NOT NULL CHECK (job_type IN ( 'full_ingest', 'incremental', 'cve_update' )), status TEXT NOT NULL DEFAULT 'pending' CHECK (status IN ( 'pending', 'running', 'completed', 'failed', 'cancelled' )), started_at TIMESTAMPTZ, completed_at TIMESTAMPTZ, functions_indexed INTEGER, fingerprints_generated INTEGER, clusters_created INTEGER, errors JSONB, created_at TIMESTAMPTZ NOT NULL DEFAULT now() ); CREATE INDEX idx_ingestion_jobs_library ON corpus.ingestion_jobs(library_id); CREATE INDEX idx_ingestion_jobs_status ON corpus.ingestion_jobs(status); CREATE INDEX idx_ingestion_jobs_tenant ON corpus.ingestion_jobs(tenant_id); ALTER TABLE corpus.ingestion_jobs ENABLE ROW LEVEL SECURITY; CREATE POLICY ingestion_jobs_tenant_policy ON corpus.ingestion_jobs FOR ALL USING (tenant_id = corpus.require_current_tenant()); -- ============================================================================= -- VIEWS -- ============================================================================= -- Library summary view CREATE OR REPLACE VIEW corpus.library_summary AS SELECT l.id, l.tenant_id, l.name, l.description, COUNT(DISTINCT lv.id) AS version_count, COUNT(DISTINCT f.id) AS function_count, COUNT(DISTINCT fc.cve_id) AS cve_count, MAX(lv.release_date) AS latest_version_date, l.updated_at FROM corpus.libraries l LEFT JOIN corpus.library_versions lv ON lv.library_id = l.id LEFT JOIN corpus.build_variants bv ON bv.library_version_id = lv.id LEFT JOIN corpus.functions f ON f.build_variant_id = bv.id LEFT JOIN corpus.function_cves fc ON fc.function_id = f.id GROUP BY l.id; -- Function with full context view CREATE OR REPLACE VIEW corpus.functions_with_context AS SELECT f.id AS function_id, f.tenant_id, f.name AS function_name, f.demangled_name, f.address, f.size_bytes, f.is_exported, bv.architecture, bv.abi, bv.compiler, bv.optimization_level, lv.version, lv.release_date, l.name AS library_name FROM corpus.functions f JOIN corpus.build_variants bv ON bv.id = f.build_variant_id JOIN corpus.library_versions lv ON lv.id = bv.library_version_id JOIN corpus.libraries l ON l.id = lv.library_id; -- ============================================================================= -- STATISTICS FUNCTION -- ============================================================================= CREATE OR REPLACE FUNCTION corpus.get_statistics() RETURNS TABLE ( library_count BIGINT, version_count BIGINT, build_variant_count BIGINT, function_count BIGINT, fingerprint_count BIGINT, cluster_count BIGINT, cve_association_count BIGINT, last_updated TIMESTAMPTZ ) LANGUAGE sql STABLE AS $$ SELECT (SELECT COUNT(*) FROM corpus.libraries), (SELECT COUNT(*) FROM corpus.library_versions), (SELECT COUNT(*) FROM corpus.build_variants), (SELECT COUNT(*) FROM corpus.functions), (SELECT COUNT(*) FROM corpus.fingerprints), (SELECT COUNT(*) FROM corpus.function_clusters), (SELECT COUNT(*) FROM corpus.function_cves), (SELECT MAX(created_at) FROM corpus.functions); $$; -- ============================================================================= -- COMMENTS -- ============================================================================= COMMENT ON SCHEMA corpus IS 'Function behavior corpus for binary identification'; COMMENT ON TABLE corpus.libraries IS 'Known libraries tracked in the corpus'; COMMENT ON TABLE corpus.library_versions IS 'Versions of libraries indexed in the corpus'; COMMENT ON TABLE corpus.build_variants IS 'Architecture/compiler variants of library versions'; COMMENT ON TABLE corpus.functions IS 'Functions extracted from build variants'; COMMENT ON TABLE corpus.fingerprints IS 'Fingerprints for function identification (multiple algorithms)'; COMMENT ON TABLE corpus.function_clusters IS 'Clusters of similar functions across versions'; COMMENT ON TABLE corpus.cluster_members IS 'Membership of functions in clusters'; COMMENT ON TABLE corpus.function_cves IS 'CVE associations for functions'; COMMENT ON TABLE corpus.ingestion_jobs IS 'Tracking for corpus ingestion jobs';