using System.Collections.Immutable;
using System.Globalization;
using System.Text;
using StellaOps.Policy;

namespace StellaOps.PolicyDsl;

/// <summary>
/// Tokenizes policy DSL source code into a stream of tokens.
/// </summary>
public static class DslTokenizer
{
    /// <summary>
    /// Scans <paramref name="source"/> end to end, producing the full token stream
    /// (always terminated by an <see cref="TokenKind.EndOfFile"/> token) together
    /// with any diagnostics collected along the way.
    /// </summary>
    public static TokenizerResult Tokenize(string source)
    {
        if (source is null)
        {
            throw new ArgumentNullException(nameof(source));
        }

        var tokens = ImmutableArray.CreateBuilder<DslToken>();
        var diagnostics = ImmutableArray.CreateBuilder<PolicyIssue>();
        var index = 0;
        var line = 1;
        var column = 1;

        while (index < source.Length)
        {
            var current = source[index];

            if (char.IsWhiteSpace(current))
            {
                (index, line, column) = AdvanceWhitespace(source, index, line, column);
                continue;
            }

            if (current == '/' && index + 1 < source.Length)
            {
                if (source[index + 1] == '/')
                {
                    (index, line, column) = SkipSingleLineComment(source, index + 2, line, column + 2);
                    continue;
                }

                if (source[index + 1] == '*')
                {
                    (index, line, column) = SkipMultiLineComment(source, index + 2, line, column + 2, diagnostics);
                    continue;
                }
            }

            var startLocation = new SourceLocation(index, line, column);

            switch (current)
            {
                case '{':
                    tokens.Add(CreateToken(TokenKind.LeftBrace, "{", startLocation, ref index, ref column));
                    break;
                case '}':
                    tokens.Add(CreateToken(TokenKind.RightBrace, "}", startLocation, ref index, ref column));
                    break;
                case '(':
                    tokens.Add(CreateToken(TokenKind.LeftParen, "(", startLocation, ref index, ref column));
                    break;
                case ')':
                    tokens.Add(CreateToken(TokenKind.RightParen, ")", startLocation, ref index, ref column));
                    break;
                case '[':
                    tokens.Add(CreateToken(TokenKind.LeftBracket, "[", startLocation, ref index, ref column));
                    break;
                case ']':
                    tokens.Add(CreateToken(TokenKind.RightBracket, "]", startLocation, ref index, ref column));
                    break;
                case ',':
                    tokens.Add(CreateToken(TokenKind.Comma, ",", startLocation, ref index, ref column));
                    break;
                case ';':
                    tokens.Add(CreateToken(TokenKind.Semicolon, ";", startLocation, ref index, ref column));
                    break;
                case ':':
                {
                    if (Match(source, index + 1, '='))
                    {
                        tokens.Add(CreateToken(TokenKind.Define, ":=", startLocation, ref index, ref column, advance: 2));
                    }
                    else
                    {
                        tokens.Add(CreateToken(TokenKind.Colon, ":", startLocation, ref index, ref column));
                    }

                    break;
                }
                case '=':
                {
                    if (Match(source, index + 1, '>'))
                    {
                        tokens.Add(CreateToken(TokenKind.Arrow, "=>", startLocation, ref index, ref column, advance: 2));
                    }
                    else if (Match(source, index + 1, '='))
                    {
                        tokens.Add(CreateToken(TokenKind.EqualEqual, "==", startLocation, ref index, ref column, advance: 2));
                    }
                    else
                    {
                        tokens.Add(CreateToken(TokenKind.Assign, "=", startLocation, ref index, ref column));
                    }

                    break;
                }
                case '!':
                {
                    if (Match(source, index + 1, '='))
                    {
                        tokens.Add(CreateToken(TokenKind.NotEqual, "!=", startLocation, ref index, ref column, advance: 2));
                    }
                    else
                    {
                        ReportUnexpectedCharacter(diagnostics, current, startLocation);
                        index++;
                        column++;
                    }

                    break;
                }
                case '<':
                {
                    if (Match(source, index + 1, '='))
                    {
                        tokens.Add(CreateToken(TokenKind.LessThanOrEqual, "<=", startLocation, ref index, ref column, advance: 2));
                    }
                    else
                    {
                        tokens.Add(CreateToken(TokenKind.LessThan, "<", startLocation, ref index, ref column));
                    }

                    break;
                }
                case '>':
                {
                    if (Match(source, index + 1, '='))
                    {
                        tokens.Add(CreateToken(TokenKind.GreaterThanOrEqual, ">=", startLocation, ref index, ref column, advance: 2));
                    }
                    else
                    {
                        tokens.Add(CreateToken(TokenKind.GreaterThan, ">", startLocation, ref index, ref column));
                    }

                    break;
                }
                case '.':
                    tokens.Add(CreateToken(TokenKind.Dot, ".", startLocation, ref index, ref column));
                    break;
                case '"':
                    TokenizeString(source, ref index, ref line, ref column, startLocation, tokens, diagnostics);
                    break;
                case '+':
                case '-':
                {
                    // A sign only starts a number when a digit follows; otherwise it is an error.
                    if (index + 1 < source.Length && char.IsDigit(source[index + 1]))
                    {
                        TokenizeNumber(source, ref index, ref line, ref column, startLocation, tokens, diagnostics);
                    }
                    else
                    {
                        ReportUnexpectedCharacter(diagnostics, current, startLocation);
                        index++;
                        column++;
                    }

                    break;
                }
                default:
                {
                    if (char.IsDigit(current))
                    {
                        TokenizeNumber(source, ref index, ref line, ref column, startLocation, tokens, diagnostics);
                    }
                    else if (IsIdentifierStart(current))
                    {
                        TokenizeIdentifierOrKeyword(source, ref index, ref line, ref column, startLocation, tokens);
                    }
                    else
                    {
                        ReportUnexpectedCharacter(diagnostics, current, startLocation);
                        index++;
                        column++;
                    }

                    break;
                }
            }
        }

        var eofLocation = new SourceLocation(index, line, column);
        tokens.Add(new DslToken(TokenKind.EndOfFile, string.Empty, new SourceSpan(eofLocation, eofLocation)));
        return new TokenizerResult(tokens.ToImmutable(), diagnostics.ToImmutable());
    }
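
    // Illustrative usage (a sketch; the policy text below is hypothetical and only
    // meant to exercise token kinds defined in this file):
    //
    //     var result = DslTokenizer.Tokenize("rule r { if severity >= 7 then escalate to \"sev:high\" }");
    //     foreach (var token in result.Tokens)
    //     {
    //         Console.WriteLine($"{token.Kind} '{token.Text}'");
    //     }
    //     // result.Diagnostics is empty on success; otherwise it carries PolicyIssue
    //     // entries such as unterminated strings or invalid escape sequences.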
    private static void TokenizeString(
        string source,
        ref int index,
        ref int line,
        ref int column,
        SourceLocation start,
        ImmutableArray<DslToken>.Builder tokens,
        ImmutableArray<PolicyIssue>.Builder diagnostics)
    {
        var builder = new StringBuilder();
        var i = index + 1;
        var currentLine = line;
        var currentColumn = column + 1;

        while (i < source.Length)
        {
            var ch = source[i];

            if (ch == '"')
            {
                var end = new SourceLocation(i + 1, currentLine, currentColumn + 1);
                index = i + 1;
                column = currentColumn + 1;
                tokens.Add(new DslToken(TokenKind.StringLiteral, builder.ToString(), new SourceSpan(start, end), builder.ToString()));
                return;
            }

            if (ch == '\\')
            {
                if (i + 1 >= source.Length)
                {
                    diagnostics.Add(PolicyIssue.Error(
                        DiagnosticCodes.UnterminatedString,
                        "Unterminated string literal.",
                        $"@{start.Line}:{start.Column}"));
                    index = source.Length;
                    line = currentLine;
                    column = currentColumn;
                    return;
                }

                var escape = source[i + 1];
                switch (escape)
                {
                    case '\\':
                        builder.Append('\\');
                        break;
                    case '"':
                        builder.Append('"');
                        break;
                    case 'n':
                        builder.Append('\n');
                        break;
                    case 'r':
                        builder.Append('\r');
                        break;
                    case 't':
                        builder.Append('\t');
                        break;
                    default:
                        diagnostics.Add(PolicyIssue.Error(
                            DiagnosticCodes.InvalidEscapeSequence,
                            $"Invalid escape sequence '\\{escape}'.",
                            $"@{currentLine}:{currentColumn}"));
                        builder.Append(escape);
                        break;
                }

                i += 2;
                currentColumn += 2;
                continue;
            }

            if (ch == '\r' || ch == '\n')
            {
                // String literals may not span lines; report and resynchronize at the newline.
                diagnostics.Add(PolicyIssue.Error(
                    DiagnosticCodes.UnterminatedString,
                    "Unterminated string literal.",
                    $"@{start.Line}:{start.Column}"));
                (index, line, column) = AdvanceWhitespace(source, i, currentLine, currentColumn);
                return;
            }

            builder.Append(ch);
            i++;
            currentColumn++;
        }

        diagnostics.Add(PolicyIssue.Error(
            DiagnosticCodes.UnterminatedString,
            "Unterminated string literal.",
            $"@{start.Line}:{start.Column}"));
        index = source.Length;
        line = currentLine;
        column = currentColumn;
    }

    private static void TokenizeNumber(
        string source,
        ref int index,
        ref int line,
        ref int column,
        SourceLocation start,
        ImmutableArray<DslToken>.Builder tokens,
        ImmutableArray<PolicyIssue>.Builder diagnostics)
    {
        var i = index;
        var hasDecimal = false;

        if (source[i] == '+' || source[i] == '-')
        {
            i++;
        }

        while (i < source.Length)
        {
            var ch = source[i];
            if (char.IsDigit(ch))
            {
                i++;
                continue;
            }

            if (ch == '.')
            {
                if (hasDecimal)
                {
                    break;
                }

                hasDecimal = true;
                i++;
                continue;
            }

            break;
        }

        // Capture the numeric text before consuming an optional '%' suffix so that
        // decimal.TryParse sees only sign, digits, and decimal point.
        var text = source.Substring(index, i - index);
        var percent = false;
        if (i < source.Length && source[i] == '%')
        {
            percent = true;
            i++;
        }

        if (!decimal.TryParse(text, NumberStyles.AllowLeadingSign | NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out var value))
        {
            diagnostics.Add(PolicyIssue.Error(
                DiagnosticCodes.InvalidNumber,
                $"Invalid numeric literal '{text}'.",
                $"@{start.Line}:{start.Column}"));
            column += i - index;
            index = i;
            return;
        }

        if (percent)
        {
            value /= 100m;
        }

        var end = new SourceLocation(i, line, column + (i - index));
        tokens.Add(new DslToken(TokenKind.NumberLiteral, text, new SourceSpan(start, end), value));
        column += i - index;
        index = i;
    }
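
    // Percent literals are normalized at lex time: "75%" yields a NumberLiteral
    // token whose Text is "75" and whose value is 0.75m, so later stages never see
    // the '%' suffix. For example (illustrative assignment syntax):
    //
    //     var r = DslTokenizer.Tokenize("threshold := 75%");
    //     // r.Tokens: Identifier "threshold", Define ":=", NumberLiteral "75" (value 0.75), EndOfFile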
    private static void TokenizeIdentifierOrKeyword(
        string source,
        ref int index,
        ref int line,
        ref int column,
        SourceLocation start,
        ImmutableArray<DslToken>.Builder tokens)
    {
        var i = index + 1;
        while (i < source.Length && IsIdentifierPart(source[i]))
        {
            i++;
        }

        var text = source.Substring(index, i - index);
        var kind = GetKeywordKind(text);
        var end = new SourceLocation(i, line, column + (i - index));

        if (kind == TokenKind.BooleanLiteral)
        {
            var value = string.Equals(text, "true", StringComparison.Ordinal);
            tokens.Add(new DslToken(TokenKind.BooleanLiteral, text, new SourceSpan(start, end), value));
        }
        else
        {
            // Keywords and plain identifiers carry no literal value.
            tokens.Add(new DslToken(kind, text, new SourceSpan(start, end)));
        }

        column += i - index;
        index = i;
    }

    private static TokenKind GetKeywordKind(string text)
    {
        return text switch
        {
            "policy" => TokenKind.KeywordPolicy,
            "syntax" => TokenKind.KeywordSyntax,
            "metadata" => TokenKind.KeywordMetadata,
            "profile" => TokenKind.KeywordProfile,
            "rule" => TokenKind.KeywordRule,
            "map" => TokenKind.KeywordMap,
            "source" => TokenKind.KeywordSource,
            "env" => TokenKind.Identifier,
            "if" => TokenKind.KeywordIf,
            "then" => TokenKind.KeywordThen,
            "when" => TokenKind.KeywordWhen,
            "and" => TokenKind.KeywordAnd,
            "or" => TokenKind.KeywordOr,
            "not" => TokenKind.KeywordNot,
            "priority" => TokenKind.KeywordPriority,
            "else" => TokenKind.KeywordElse,
            "because" => TokenKind.KeywordBecause,
            "settings" => TokenKind.KeywordSettings,
            "ignore" => TokenKind.KeywordIgnore,
            "until" => TokenKind.KeywordUntil,
            "escalate" => TokenKind.KeywordEscalate,
            "to" => TokenKind.KeywordTo,
            "requireVex" => TokenKind.KeywordRequireVex,
            "warn" => TokenKind.KeywordWarn,
            "message" => TokenKind.KeywordMessage,
            "defer" => TokenKind.KeywordDefer,
            "annotate" => TokenKind.KeywordAnnotate,
            "in" => TokenKind.KeywordIn,
            "true" => TokenKind.BooleanLiteral,
            "false" => TokenKind.BooleanLiteral,
            _ => TokenKind.Identifier,
        };
    }

    private static bool IsIdentifierStart(char ch) => char.IsLetter(ch) || ch == '_';

    private static bool IsIdentifierPart(char ch) => char.IsLetterOrDigit(ch) || ch == '_' || ch == '-';

    private static (int Index, int Line, int Column) AdvanceWhitespace(string source, int index, int line, int column)
    {
        var i = index;
        var currentLine = line;
        var currentColumn = column;

        while (i < source.Length)
        {
            var ch = source[i];

            if (ch == '\r')
            {
                // Treat "\r\n" as a single newline.
                if (i + 1 < source.Length && source[i + 1] == '\n')
                {
                    i += 2;
                }
                else
                {
                    i++;
                }

                currentLine++;
                currentColumn = 1;
                continue;
            }

            if (ch == '\n')
            {
                i++;
                currentLine++;
                currentColumn = 1;
                continue;
            }

            if (!char.IsWhiteSpace(ch))
            {
                break;
            }

            i++;
            currentColumn++;
        }

        return (i, currentLine, currentColumn);
    }

    private static (int Index, int Line, int Column) SkipSingleLineComment(string source, int index, int line, int column)
    {
        var i = index;
        var currentLine = line;
        var currentColumn = column;

        while (i < source.Length)
        {
            var ch = source[i];
            if (ch == '\r' || ch == '\n')
            {
                return AdvanceWhitespace(source, i, currentLine, currentColumn);
            }

            i++;
            currentColumn++;
        }

        return (i, currentLine, currentColumn);
    }
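
    // Note: block comments do not nest. The scan below ends at the first "*/", so
    // in "/* a /* b */ c */" the trailing " c */" is handed back to the main loop.
    // A sketch of the observable behavior:
    //
    //     var r = DslTokenizer.Tokenize("/* outer /* inner */ rule");
    //     // r.Tokens: KeywordRule "rule", EndOfFile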
    private static (int Index, int Line, int Column) SkipMultiLineComment(
        string source,
        int index,
        int line,
        int column,
        ImmutableArray<PolicyIssue>.Builder diagnostics)
    {
        var i = index;
        var currentLine = line;
        var currentColumn = column;

        while (i < source.Length)
        {
            var ch = source[i];

            if (ch == '*' && i + 1 < source.Length && source[i + 1] == '/')
            {
                return (i + 2, currentLine, currentColumn + 2);
            }

            if (ch == '\r')
            {
                if (i + 1 < source.Length && source[i + 1] == '\n')
                {
                    i += 2;
                }
                else
                {
                    i++;
                }

                currentLine++;
                currentColumn = 1;
                continue;
            }

            if (ch == '\n')
            {
                i++;
                currentLine++;
                currentColumn = 1;
                continue;
            }

            i++;
            currentColumn++;
        }

        diagnostics.Add(PolicyIssue.Error(
            DiagnosticCodes.UnexpectedCharacter,
            "Unterminated comment block.",
            $"@{line}:{column}"));
        return (source.Length, currentLine, currentColumn);
    }

    private static DslToken CreateToken(
        TokenKind kind,
        string text,
        SourceLocation start,
        ref int index,
        ref int column,
        int advance = 1)
    {
        var end = new SourceLocation(index + advance, start.Line, start.Column + advance);
        index += advance;
        column += advance;
        return new DslToken(kind, text, new SourceSpan(start, end));
    }

    private static void ReportUnexpectedCharacter(
        ImmutableArray<PolicyIssue>.Builder diagnostics,
        char ch,
        SourceLocation location)
    {
        diagnostics.Add(PolicyIssue.Error(
            DiagnosticCodes.UnexpectedCharacter,
            $"Unexpected character '{ch}'.",
            $"@{location.Line}:{location.Column}"));
    }

    private static bool Match(string source, int index, char expected)
        => index < source.Length && source[index] == expected;
}

/// <summary>
/// Result of tokenizing a policy DSL source.
/// </summary>
public readonly record struct TokenizerResult(
    ImmutableArray<DslToken> Tokens,
    ImmutableArray<PolicyIssue> Diagnostics);
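
// Consumer-side sketch (hypothetical host code; adjust to the actual PolicyIssue
// shape defined in StellaOps.Policy). Diagnostics encode their location as an
// "@line:column" pointer string, as produced throughout this tokenizer:
//
//     var result = DslTokenizer.Tokenize(sourceText);
//     foreach (var issue in result.Diagnostics)
//     {
//         Console.Error.WriteLine(issue);
//     }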