141 lines
4.9 KiB
SQL
141 lines
4.9 KiB
SQL
-- BSim PostgreSQL Schema Initialization
|
|
-- Copyright (c) StellaOps. All rights reserved.
|
|
-- Licensed under AGPL-3.0-or-later.
|
|
--
|
|
-- This script creates the core BSim schema structure.
|
|
-- Note: Full Ghidra BSim schema is auto-created by Ghidra tools.
|
|
-- This provides a minimal functional schema for integration testing.
|
|
|
|
-- Create schema comment
|
|
COMMENT ON DATABASE bsim_corpus IS 'Ghidra BSim function signature database for StellaOps BinaryIndex';
|
|
|
|
-- Enable required extensions
|
|
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
|
|
CREATE EXTENSION IF NOT EXISTS "pg_trgm";
|
|
|
|
-- BSim executables table
|
|
CREATE TABLE IF NOT EXISTS bsim_executables (
|
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
|
name TEXT NOT NULL,
|
|
architecture TEXT NOT NULL,
|
|
library_name TEXT,
|
|
library_version TEXT,
|
|
md5_hash BYTEA,
|
|
sha256_hash BYTEA,
|
|
date_added TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
UNIQUE (sha256_hash)
|
|
);
|
|
|
|
-- BSim functions table
|
|
CREATE TABLE IF NOT EXISTS bsim_functions (
|
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
|
executable_id UUID NOT NULL REFERENCES bsim_executables(id) ON DELETE CASCADE,
|
|
name TEXT NOT NULL,
|
|
address BIGINT NOT NULL,
|
|
flags INTEGER DEFAULT 0,
|
|
UNIQUE (executable_id, address)
|
|
);
|
|
|
|
-- BSim function vectors (feature vectors for similarity)
|
|
CREATE TABLE IF NOT EXISTS bsim_vectors (
|
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
|
function_id UUID NOT NULL REFERENCES bsim_functions(id) ON DELETE CASCADE,
|
|
lsh_hash BYTEA NOT NULL, -- Locality-sensitive hash
|
|
feature_count INTEGER NOT NULL,
|
|
vector_data BYTEA NOT NULL, -- Serialized feature vector
|
|
UNIQUE (function_id)
|
|
);
|
|
|
|
-- BSim function signatures (compact fingerprints)
|
|
CREATE TABLE IF NOT EXISTS bsim_signatures (
|
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
|
function_id UUID NOT NULL REFERENCES bsim_functions(id) ON DELETE CASCADE,
|
|
signature_type TEXT NOT NULL, -- 'basic', 'weighted', 'full'
|
|
signature_hash BYTEA NOT NULL,
|
|
significance REAL NOT NULL DEFAULT 0.0,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
UNIQUE (function_id, signature_type)
|
|
);
|
|
|
|
-- BSim clusters (similar function groups)
|
|
CREATE TABLE IF NOT EXISTS bsim_clusters (
|
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
|
name TEXT,
|
|
function_count INTEGER NOT NULL DEFAULT 0,
|
|
centroid_vector BYTEA,
|
|
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
);
|
|
|
|
-- Cluster membership
|
|
CREATE TABLE IF NOT EXISTS bsim_cluster_members (
|
|
cluster_id UUID NOT NULL REFERENCES bsim_clusters(id) ON DELETE CASCADE,
|
|
function_id UUID NOT NULL REFERENCES bsim_functions(id) ON DELETE CASCADE,
|
|
similarity REAL NOT NULL,
|
|
PRIMARY KEY (cluster_id, function_id)
|
|
);
|
|
|
|
-- Ingestion tracking
|
|
CREATE TABLE IF NOT EXISTS bsim_ingest_log (
|
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
|
executable_id UUID REFERENCES bsim_executables(id),
|
|
library_name TEXT NOT NULL,
|
|
library_version TEXT,
|
|
functions_ingested INTEGER NOT NULL DEFAULT 0,
|
|
status TEXT NOT NULL DEFAULT 'pending',
|
|
error_message TEXT,
|
|
started_at TIMESTAMPTZ,
|
|
completed_at TIMESTAMPTZ,
|
|
ingested_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
);
|
|
|
|
-- Indexes for efficient querying
|
|
CREATE INDEX IF NOT EXISTS idx_bsim_functions_executable ON bsim_functions(executable_id);
|
|
CREATE INDEX IF NOT EXISTS idx_bsim_functions_name ON bsim_functions(name);
|
|
CREATE INDEX IF NOT EXISTS idx_bsim_vectors_lsh ON bsim_vectors USING hash (lsh_hash);
|
|
CREATE INDEX IF NOT EXISTS idx_bsim_signatures_hash ON bsim_signatures USING hash (signature_hash);
|
|
CREATE INDEX IF NOT EXISTS idx_bsim_executables_library ON bsim_executables(library_name, library_version);
|
|
CREATE INDEX IF NOT EXISTS idx_bsim_ingest_log_status ON bsim_ingest_log(status);
|
|
|
|
-- Views for common queries
|
|
CREATE OR REPLACE VIEW bsim_function_summary AS
|
|
SELECT
|
|
f.id AS function_id,
|
|
f.name AS function_name,
|
|
f.address,
|
|
e.name AS executable_name,
|
|
e.library_name,
|
|
e.library_version,
|
|
e.architecture,
|
|
s.significance
|
|
FROM bsim_functions f
|
|
JOIN bsim_executables e ON f.executable_id = e.id
|
|
LEFT JOIN bsim_signatures s ON f.id = s.function_id AND s.signature_type = 'basic';
|
|
|
|
CREATE OR REPLACE VIEW bsim_library_stats AS
|
|
SELECT
|
|
e.library_name,
|
|
e.library_version,
|
|
COUNT(DISTINCT e.id) AS executable_count,
|
|
COUNT(DISTINCT f.id) AS function_count,
|
|
MAX(l.ingested_at) AS last_ingested
|
|
FROM bsim_executables e
|
|
LEFT JOIN bsim_functions f ON e.id = f.executable_id
|
|
LEFT JOIN bsim_ingest_log l ON e.id = l.executable_id
|
|
WHERE e.library_name IS NOT NULL
|
|
GROUP BY e.library_name, e.library_version
|
|
ORDER BY e.library_name, e.library_version;
|
|
|
|
-- Grant permissions
|
|
GRANT ALL ON ALL TABLES IN SCHEMA public TO bsim_user;
|
|
GRANT ALL ON ALL SEQUENCES IN SCHEMA public TO bsim_user;
|
|
|
|
-- Insert schema version marker
|
|
INSERT INTO bsim_ingest_log (library_name, functions_ingested, status, completed_at)
|
|
VALUES ('_schema_init', 0, 'completed', now());
|
|
|
|
-- Log successful initialization
|
|
DO $$
|
|
BEGIN
|
|
RAISE NOTICE 'BSim schema initialized successfully';
|
|
END $$;
|