Files
git.stella-ops.org/devops/docker/ghidra/scripts/init-bsim.sql

141 lines
4.9 KiB
SQL

-- BSim PostgreSQL Schema Initialization
-- Copyright (c) StellaOps. All rights reserved.
-- Licensed under BUSL-1.1.
--
-- This script creates the core BSim schema structure.
-- Note: Full Ghidra BSim schema is auto-created by Ghidra tools.
-- This provides a minimal functional schema for integration testing.
-- Create schema comment
COMMENT ON DATABASE bsim_corpus IS 'Ghidra BSim function signature database for StellaOps BinaryIndex';
-- Enable required extensions
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS "pg_trgm";
-- BSim executables table
CREATE TABLE IF NOT EXISTS bsim_executables (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
name TEXT NOT NULL,
architecture TEXT NOT NULL,
library_name TEXT,
library_version TEXT,
md5_hash BYTEA,
sha256_hash BYTEA,
date_added TIMESTAMPTZ NOT NULL DEFAULT now(),
UNIQUE (sha256_hash)
);
-- BSim functions table
CREATE TABLE IF NOT EXISTS bsim_functions (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
executable_id UUID NOT NULL REFERENCES bsim_executables(id) ON DELETE CASCADE,
name TEXT NOT NULL,
address BIGINT NOT NULL,
flags INTEGER DEFAULT 0,
UNIQUE (executable_id, address)
);
-- BSim function vectors (feature vectors for similarity)
CREATE TABLE IF NOT EXISTS bsim_vectors (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
function_id UUID NOT NULL REFERENCES bsim_functions(id) ON DELETE CASCADE,
lsh_hash BYTEA NOT NULL, -- Locality-sensitive hash
feature_count INTEGER NOT NULL,
vector_data BYTEA NOT NULL, -- Serialized feature vector
UNIQUE (function_id)
);
-- BSim function signatures (compact fingerprints)
CREATE TABLE IF NOT EXISTS bsim_signatures (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
function_id UUID NOT NULL REFERENCES bsim_functions(id) ON DELETE CASCADE,
signature_type TEXT NOT NULL, -- 'basic', 'weighted', 'full'
signature_hash BYTEA NOT NULL,
significance REAL NOT NULL DEFAULT 0.0,
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
UNIQUE (function_id, signature_type)
);
-- BSim clusters (similar function groups)
CREATE TABLE IF NOT EXISTS bsim_clusters (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
name TEXT,
function_count INTEGER NOT NULL DEFAULT 0,
centroid_vector BYTEA,
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Cluster membership
CREATE TABLE IF NOT EXISTS bsim_cluster_members (
cluster_id UUID NOT NULL REFERENCES bsim_clusters(id) ON DELETE CASCADE,
function_id UUID NOT NULL REFERENCES bsim_functions(id) ON DELETE CASCADE,
similarity REAL NOT NULL,
PRIMARY KEY (cluster_id, function_id)
);
-- Ingestion tracking
CREATE TABLE IF NOT EXISTS bsim_ingest_log (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
executable_id UUID REFERENCES bsim_executables(id),
library_name TEXT NOT NULL,
library_version TEXT,
functions_ingested INTEGER NOT NULL DEFAULT 0,
status TEXT NOT NULL DEFAULT 'pending',
error_message TEXT,
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
ingested_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Indexes for efficient querying
CREATE INDEX IF NOT EXISTS idx_bsim_functions_executable ON bsim_functions(executable_id);
CREATE INDEX IF NOT EXISTS idx_bsim_functions_name ON bsim_functions(name);
CREATE INDEX IF NOT EXISTS idx_bsim_vectors_lsh ON bsim_vectors USING hash (lsh_hash);
CREATE INDEX IF NOT EXISTS idx_bsim_signatures_hash ON bsim_signatures USING hash (signature_hash);
CREATE INDEX IF NOT EXISTS idx_bsim_executables_library ON bsim_executables(library_name, library_version);
CREATE INDEX IF NOT EXISTS idx_bsim_ingest_log_status ON bsim_ingest_log(status);
-- Views for common queries
CREATE OR REPLACE VIEW bsim_function_summary AS
SELECT
f.id AS function_id,
f.name AS function_name,
f.address,
e.name AS executable_name,
e.library_name,
e.library_version,
e.architecture,
s.significance
FROM bsim_functions f
JOIN bsim_executables e ON f.executable_id = e.id
LEFT JOIN bsim_signatures s ON f.id = s.function_id AND s.signature_type = 'basic';
CREATE OR REPLACE VIEW bsim_library_stats AS
SELECT
e.library_name,
e.library_version,
COUNT(DISTINCT e.id) AS executable_count,
COUNT(DISTINCT f.id) AS function_count,
MAX(l.ingested_at) AS last_ingested
FROM bsim_executables e
LEFT JOIN bsim_functions f ON e.id = f.executable_id
LEFT JOIN bsim_ingest_log l ON e.id = l.executable_id
WHERE e.library_name IS NOT NULL
GROUP BY e.library_name, e.library_version
ORDER BY e.library_name, e.library_version;
-- Grant permissions
GRANT ALL ON ALL TABLES IN SCHEMA public TO bsim_user;
GRANT ALL ON ALL SEQUENCES IN SCHEMA public TO bsim_user;
-- Insert schema version marker
INSERT INTO bsim_ingest_log (library_name, functions_ingested, status, completed_at)
VALUES ('_schema_init', 0, 'completed', now());
-- Log successful initialization
DO $$
BEGIN
RAISE NOTICE 'BSim schema initialized successfully';
END $$;