// Files
// git.stella-ops.org/src/Policy/StellaOps.PolicyDsl/DslTokenizer.cs
// StellaOps Bot 8abbf9574d up
// 2025-11-27 21:10:06 +02:00
//
// 583 lines
// 19 KiB
// C#

using System.Collections.Immutable;
using System.Globalization;
using System.Text;
using StellaOps.Policy;
namespace StellaOps.PolicyDsl;
/// <summary>
/// Tokenizes policy DSL source code into a stream of tokens.
/// </summary>
public static class DslTokenizer
{
/// <summary>
/// Scans policy DSL text and produces its tokens together with any lexical diagnostics.
/// </summary>
/// <param name="source">The DSL text to scan.</param>
/// <returns>
/// The tokens in source order, always terminated by an <see cref="TokenKind.EndOfFile"/> token,
/// and the issues reported while scanning.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="source"/> is <c>null</c>.</exception>
public static TokenizerResult Tokenize(string source)
{
    if (source is null)
    {
        throw new ArgumentNullException(nameof(source));
    }

    var output = ImmutableArray.CreateBuilder<DslToken>();
    var issues = ImmutableArray.CreateBuilder<PolicyIssue>();
    int index = 0, line = 1, column = 1;

    while (index < source.Length)
    {
        var ch = source[index];

        // Trivia: whitespace and both comment forms produce no tokens.
        if (char.IsWhiteSpace(ch))
        {
            (index, line, column) = AdvanceWhitespace(source, index, line, column);
            continue;
        }

        if (ch == '/' && Match(source, index + 1, '/'))
        {
            (index, line, column) = SkipSingleLineComment(source, index + 2, line, column + 2);
            continue;
        }

        if (ch == '/' && Match(source, index + 1, '*'))
        {
            (index, line, column) = SkipMultiLineComment(source, index + 2, line, column + 2, issues);
            continue;
        }

        var tokenStart = new SourceLocation(index, line, column);

        // Cleared by any branch that cannot start a token with this character.
        var recognized = true;

        switch (ch)
        {
            case '{':
                output.Add(CreateToken(TokenKind.LeftBrace, "{", tokenStart, ref index, ref column));
                break;

            case '}':
                output.Add(CreateToken(TokenKind.RightBrace, "}", tokenStart, ref index, ref column));
                break;

            case '(':
                output.Add(CreateToken(TokenKind.LeftParen, "(", tokenStart, ref index, ref column));
                break;

            case ')':
                output.Add(CreateToken(TokenKind.RightParen, ")", tokenStart, ref index, ref column));
                break;

            case '[':
                output.Add(CreateToken(TokenKind.LeftBracket, "[", tokenStart, ref index, ref column));
                break;

            case ']':
                output.Add(CreateToken(TokenKind.RightBracket, "]", tokenStart, ref index, ref column));
                break;

            case ',':
                output.Add(CreateToken(TokenKind.Comma, ",", tokenStart, ref index, ref column));
                break;

            case ';':
                output.Add(CreateToken(TokenKind.Semicolon, ";", tokenStart, ref index, ref column));
                break;

            case '.':
                output.Add(CreateToken(TokenKind.Dot, ".", tokenStart, ref index, ref column));
                break;

            // Two-character operators are tried before their one-character prefix.
            case ':' when Match(source, index + 1, '='):
                output.Add(CreateToken(TokenKind.Define, ":=", tokenStart, ref index, ref column, advance: 2));
                break;

            case ':':
                output.Add(CreateToken(TokenKind.Colon, ":", tokenStart, ref index, ref column));
                break;

            case '=' when Match(source, index + 1, '>'):
                output.Add(CreateToken(TokenKind.Arrow, "=>", tokenStart, ref index, ref column, advance: 2));
                break;

            case '=' when Match(source, index + 1, '='):
                output.Add(CreateToken(TokenKind.EqualEqual, "==", tokenStart, ref index, ref column, advance: 2));
                break;

            case '=':
                output.Add(CreateToken(TokenKind.Assign, "=", tokenStart, ref index, ref column));
                break;

            case '!' when Match(source, index + 1, '='):
                output.Add(CreateToken(TokenKind.NotEqual, "!=", tokenStart, ref index, ref column, advance: 2));
                break;

            case '!':
                // A lone '!' is not a token in this tokenizer.
                recognized = false;
                break;

            case '<' when Match(source, index + 1, '='):
                output.Add(CreateToken(TokenKind.LessThanOrEqual, "<=", tokenStart, ref index, ref column, advance: 2));
                break;

            case '<':
                output.Add(CreateToken(TokenKind.LessThan, "<", tokenStart, ref index, ref column));
                break;

            case '>' when Match(source, index + 1, '='):
                output.Add(CreateToken(TokenKind.GreaterThanOrEqual, ">=", tokenStart, ref index, ref column, advance: 2));
                break;

            case '>':
                output.Add(CreateToken(TokenKind.GreaterThan, ">", tokenStart, ref index, ref column));
                break;

            case '"':
                TokenizeString(source, ref index, ref line, ref column, tokenStart, output, issues);
                break;

            case '+':
            case '-':
                // A sign is only accepted as the prefix of a numeric literal.
                if (index + 1 < source.Length && char.IsDigit(source[index + 1]))
                {
                    TokenizeNumber(source, ref index, ref line, ref column, tokenStart, output, issues);
                }
                else
                {
                    recognized = false;
                }

                break;

            default:
                if (char.IsDigit(ch))
                {
                    TokenizeNumber(source, ref index, ref line, ref column, tokenStart, output, issues);
                }
                else if (IsIdentifierStart(ch))
                {
                    TokenizeIdentifierOrKeyword(source, ref index, ref line, ref column, tokenStart, output);
                }
                else
                {
                    recognized = false;
                }

                break;
        }

        if (!recognized)
        {
            // Report the offending character and step over it so scanning can continue.
            ReportUnexpectedCharacter(issues, ch, tokenStart);
            index++;
            column++;
        }
    }

    var eof = new SourceLocation(index, line, column);
    output.Add(new DslToken(TokenKind.EndOfFile, string.Empty, new SourceSpan(eof, eof)));
    return new TokenizerResult(output.ToImmutable(), issues.ToImmutable());
}
/// <summary>
/// Scans a double-quoted string literal whose opening quote is at <paramref name="index"/>.
/// </summary>
/// <remarks>
/// On success a <see cref="TokenKind.StringLiteral"/> token holding the unescaped text is appended.
/// Recognized escapes are <c>\\</c>, <c>\"</c>, <c>\n</c>, <c>\r</c> and <c>\t</c>; any other escape is
/// reported and the escaped character is kept literally. A literal never spans lines: reaching a line
/// break (even one preceded by a backslash) or the end of input reports an unterminated string and
/// emits no token.
/// </remarks>
/// <param name="source">The full DSL text.</param>
/// <param name="index">Position of the opening quote; updated to the first unconsumed position.</param>
/// <param name="line">Current line; updated when a terminating line break is consumed.</param>
/// <param name="column">Current column; updated to match <paramref name="index"/>.</param>
/// <param name="start">Location of the opening quote, used for the token span and diagnostics.</param>
/// <param name="tokens">Receives the string token.</param>
/// <param name="diagnostics">Receives unterminated-string and invalid-escape issues.</param>
private static void TokenizeString(
    string source,
    ref int index,
    ref int line,
    ref int column,
    SourceLocation start,
    ImmutableArray<DslToken>.Builder tokens,
    ImmutableArray<PolicyIssue>.Builder diagnostics)
{
    var builder = new StringBuilder();
    var i = index + 1; // first character after the opening quote
    var currentLine = line;
    var currentColumn = column + 1;

    while (i < source.Length)
    {
        var ch = source[i];

        if (ch == '"')
        {
            // Materialize the text once; it serves as both the token text and its value.
            var value = builder.ToString();
            var end = new SourceLocation(i + 1, currentLine, currentColumn + 1);
            index = i + 1;
            column = currentColumn + 1;
            tokens.Add(new DslToken(TokenKind.StringLiteral, value, new SourceSpan(start, end), value));
            return;
        }

        if (ch == '\\')
        {
            if (i + 1 >= source.Length)
            {
                // Trailing backslash at end of input: count it so column stays in step with index.
                ReportUnterminated();
                index = source.Length;
                line = currentLine;
                column = currentColumn + 1;
                return;
            }

            var escape = source[i + 1];

            if (escape == '\r' || escape == '\n')
            {
                // A backslash cannot continue a literal onto the next line. Previously a bare
                // '\n' or '\r' was swallowed here without advancing the line counter, which
                // shifted every later position and made LF input behave differently from CRLF
                // input. The literal now always ends at the line break.
                diagnostics.Add(PolicyIssue.Error(DiagnosticCodes.InvalidEscapeSequence, $"Invalid escape sequence '\\{escape}'.", $"@{currentLine}:{currentColumn}"));
                ReportUnterminated();
                (index, line, column) = AdvanceWhitespace(source, i + 1, currentLine, currentColumn + 1);
                return;
            }

            switch (escape)
            {
                case '\\':
                    builder.Append('\\');
                    break;
                case '"':
                    builder.Append('"');
                    break;
                case 'n':
                    builder.Append('\n');
                    break;
                case 'r':
                    builder.Append('\r');
                    break;
                case 't':
                    builder.Append('\t');
                    break;
                default:
                    // Unknown escape: report it but keep the character so scanning continues.
                    diagnostics.Add(PolicyIssue.Error(DiagnosticCodes.InvalidEscapeSequence, $"Invalid escape sequence '\\{escape}'.", $"@{currentLine}:{currentColumn}"));
                    builder.Append(escape);
                    break;
            }

            i += 2;
            currentColumn += 2;
            continue;
        }

        if (ch == '\r' || ch == '\n')
        {
            // Raw line break inside the literal: give up and resume after the whitespace.
            ReportUnterminated();
            (index, line, column) = AdvanceWhitespace(source, i, currentLine, currentColumn);
            return;
        }

        builder.Append(ch);
        i++;
        currentColumn++;
    }

    // Ran off the end of the input without finding the closing quote.
    ReportUnterminated();
    index = source.Length;
    line = currentLine;
    column = currentColumn;

    // All unterminated-string reports point at the opening quote.
    void ReportUnterminated() =>
        diagnostics.Add(PolicyIssue.Error(DiagnosticCodes.UnterminatedString, "Unterminated string literal.", $"@{start.Line}:{start.Column}"));
}
/// <summary>
/// Scans a numeric literal starting at <paramref name="index"/>: an optional sign, digits with at most
/// one decimal point, and an optional trailing <c>%</c>.
/// </summary>
/// <remarks>
/// On success a <see cref="TokenKind.NumberLiteral"/> token is appended whose text is the full lexeme
/// (including any <c>%</c>) and whose value is the parsed <see cref="decimal"/>; a percent literal's
/// value is divided by 100. If the digits cannot be parsed, an invalid-number issue is reported and no
/// token is emitted. In both cases the lexeme is consumed.
/// </remarks>
/// <param name="source">The full DSL text.</param>
/// <param name="index">Position of the first character of the literal; updated past the lexeme.</param>
/// <param name="line">Current line; read for the token span, never modified here.</param>
/// <param name="column">Current column; advanced by the lexeme length.</param>
/// <param name="start">Location of the first character, used for the token span and diagnostics.</param>
/// <param name="tokens">Receives the number token.</param>
/// <param name="diagnostics">Receives the invalid-number issue.</param>
private static void TokenizeNumber(
    string source,
    ref int index,
    ref int line,
    ref int column,
    SourceLocation start,
    ImmutableArray<DslToken>.Builder tokens,
    ImmutableArray<PolicyIssue>.Builder diagnostics)
{
    var i = index;
    var hasDecimal = false;

    if (source[i] == '+' || source[i] == '-')
    {
        i++;
    }

    while (i < source.Length)
    {
        var ch = source[i];
        if (char.IsDigit(ch))
        {
            i++;
            continue;
        }

        if (ch == '.')
        {
            if (hasDecimal)
            {
                // A second '.' is not part of this literal.
                break;
            }

            hasDecimal = true;
            i++;
            continue;
        }

        break;
    }

    var percent = false;
    if (i < source.Length && source[i] == '%')
    {
        percent = true;
        i++;
    }

    var length = i - index;
    var text = source.Substring(index, length);

    // The '%' suffix is part of the lexeme but must not reach decimal.TryParse: the number styles
    // used here do not accept it, so every percent literal used to be rejected as invalid.
    var numericText = percent ? text.Substring(0, text.Length - 1) : text;

    if (!decimal.TryParse(numericText, NumberStyles.AllowLeadingSign | NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out var value))
    {
        diagnostics.Add(PolicyIssue.Error(DiagnosticCodes.InvalidNumber, $"Invalid numeric literal '{text}'.", $"@{start.Line}:{start.Column}"));

        // Advance the column before the index; the reverse order added zero to the column.
        column += length;
        index = i;
        return;
    }

    if (percent)
    {
        value /= 100m;
    }

    var end = new SourceLocation(i, line, column + length);
    tokens.Add(new DslToken(TokenKind.NumberLiteral, text, new SourceSpan(start, end), value));
    column += length;
    index = i;
}
/// <summary>
/// Scans an identifier-shaped lexeme starting at <paramref name="index"/> and appends it as a
/// keyword, boolean literal, or identifier token according to <see cref="GetKeywordKind"/>.
/// </summary>
/// <param name="source">The full DSL text.</param>
/// <param name="index">Position of the first character; updated past the lexeme.</param>
/// <param name="line">Current line; read for the token span, never modified here.</param>
/// <param name="column">Current column; advanced by the lexeme length.</param>
/// <param name="start">Location of the first character, used as the span start.</param>
/// <param name="tokens">Receives the resulting token.</param>
private static void TokenizeIdentifierOrKeyword(
    string source,
    ref int index,
    ref int line,
    ref int column,
    SourceLocation start,
    ImmutableArray<DslToken>.Builder tokens)
{
    // The caller has already vetted the first character; extend over the remaining parts.
    var stop = index + 1;
    while (stop < source.Length && IsIdentifierPart(source[stop]))
    {
        stop++;
    }

    var length = stop - index;
    var lexeme = source.Substring(index, length);
    var kind = GetKeywordKind(lexeme);
    var span = new SourceSpan(start, new SourceLocation(stop, line, column + length));

    if (kind == TokenKind.BooleanLiteral)
    {
        // Boolean literals additionally carry their parsed value.
        var isTrue = string.Equals(lexeme, "true", StringComparison.Ordinal);
        tokens.Add(new DslToken(TokenKind.BooleanLiteral, lexeme, span, isTrue));
    }
    else
    {
        // Keywords and plain identifiers are built the same way; only the kind differs.
        tokens.Add(new DslToken(kind, lexeme, span));
    }

    column += length;
    index = stop;
}
/// <summary>
/// Classifies an identifier-shaped lexeme.
/// </summary>
/// <param name="text">The exact lexeme; matching is case-sensitive.</param>
/// <returns>
/// The matching keyword kind, <see cref="TokenKind.BooleanLiteral"/> for <c>true</c>/<c>false</c>,
/// or <see cref="TokenKind.Identifier"/> for anything else.
/// </returns>
private static TokenKind GetKeywordKind(string text) => text switch
{
    "true" or "false" => TokenKind.BooleanLiteral,

    "and" => TokenKind.KeywordAnd,
    "annotate" => TokenKind.KeywordAnnotate,
    "because" => TokenKind.KeywordBecause,
    "defer" => TokenKind.KeywordDefer,
    "else" => TokenKind.KeywordElse,
    "escalate" => TokenKind.KeywordEscalate,
    "if" => TokenKind.KeywordIf,
    "ignore" => TokenKind.KeywordIgnore,
    "in" => TokenKind.KeywordIn,
    "map" => TokenKind.KeywordMap,
    "message" => TokenKind.KeywordMessage,
    "metadata" => TokenKind.KeywordMetadata,
    "not" => TokenKind.KeywordNot,
    "or" => TokenKind.KeywordOr,
    "policy" => TokenKind.KeywordPolicy,
    "priority" => TokenKind.KeywordPriority,
    "profile" => TokenKind.KeywordProfile,
    "requireVex" => TokenKind.KeywordRequireVex,
    "rule" => TokenKind.KeywordRule,
    "settings" => TokenKind.KeywordSettings,
    "source" => TokenKind.KeywordSource,
    "syntax" => TokenKind.KeywordSyntax,
    "then" => TokenKind.KeywordThen,
    "to" => TokenKind.KeywordTo,
    "until" => TokenKind.KeywordUntil,
    "warn" => TokenKind.KeywordWarn,
    "when" => TokenKind.KeywordWhen,

    // "env" is listed explicitly but classified like any other identifier.
    "env" => TokenKind.Identifier,
    _ => TokenKind.Identifier,
};
/// <summary>
/// Tells whether <paramref name="ch"/> may begin an identifier: an underscore or any letter.
/// </summary>
private static bool IsIdentifierStart(char ch)
{
    if (ch == '_')
    {
        return true;
    }

    return char.IsLetter(ch);
}
/// <summary>
/// Tells whether <paramref name="ch"/> may continue an identifier: a letter, a digit,
/// an underscore, or a hyphen.
/// </summary>
private static bool IsIdentifierPart(char ch)
{
    if (ch == '_' || ch == '-')
    {
        return true;
    }

    return char.IsLetterOrDigit(ch);
}
/// <summary>
/// Skips a run of whitespace starting at <paramref name="index"/>, keeping line and column in step.
/// </summary>
/// <param name="source">The full DSL text.</param>
/// <param name="index">Position to start skipping from.</param>
/// <param name="line">Line number at <paramref name="index"/>.</param>
/// <param name="column">Column number at <paramref name="index"/>.</param>
/// <returns>
/// The position of the first non-whitespace character (or the end of the text) and its line and column.
/// <c>\r\n</c>, a lone <c>\r</c>, and a lone <c>\n</c> each count as one line break that resets the
/// column to 1; any other whitespace character advances the column by one.
/// </returns>
private static (int Index, int Line, int Column) AdvanceWhitespace(string source, int index, int line, int column)
{
    var pos = index;
    var row = line;
    var col = column;

    while (pos < source.Length)
    {
        var ch = source[pos];

        if (ch == '\r' || ch == '\n')
        {
            // Treat CR LF as a single line break.
            var isCrLf = ch == '\r' && pos + 1 < source.Length && source[pos + 1] == '\n';
            pos += isCrLf ? 2 : 1;
            row++;
            col = 1;
        }
        else if (char.IsWhiteSpace(ch))
        {
            pos++;
            col++;
        }
        else
        {
            break;
        }
    }

    return (pos, row, col);
}
/// <summary>
/// Skips the body of a <c>//</c> comment. <paramref name="index"/> and <paramref name="column"/>
/// are expected to point just past the <c>//</c> marker.
/// </summary>
/// <param name="source">The full DSL text.</param>
/// <param name="index">Position of the first character of the comment body.</param>
/// <param name="line">Line number of the comment.</param>
/// <param name="column">Column number at <paramref name="index"/>.</param>
/// <returns>
/// The position after the comment. When the comment ends at a line break, the break and any
/// whitespace that follows it are consumed as well via <see cref="AdvanceWhitespace"/>.
/// </returns>
private static (int Index, int Line, int Column) SkipSingleLineComment(string source, int index, int line, int column)
{
    var pos = index;

    for (; pos < source.Length; pos++)
    {
        if (source[pos] is '\r' or '\n')
        {
            // Every skipped character advanced the column by one.
            return AdvanceWhitespace(source, pos, line, column + (pos - index));
        }
    }

    // The comment runs to the end of the input.
    return (pos, line, column + (pos - index));
}
/// <summary>
/// Skips the body of a <c>/* ... */</c> comment. <paramref name="index"/> and
/// <paramref name="column"/> are expected to point just past the <c>/*</c> marker.
/// </summary>
/// <param name="source">The full DSL text.</param>
/// <param name="index">Position of the first character of the comment body.</param>
/// <param name="line">Line number at <paramref name="index"/>.</param>
/// <param name="column">Column number at <paramref name="index"/>.</param>
/// <param name="diagnostics">Receives an issue when the comment is never closed.</param>
/// <returns>
/// The position just after the closing <c>*/</c>, or the end of the text when the comment is
/// unterminated. In the unterminated case the issue is reported at the incoming
/// <paramref name="line"/> and <paramref name="column"/>.
/// </returns>
private static (int Index, int Line, int Column) SkipMultiLineComment(
    string source,
    int index,
    int line,
    int column,
    ImmutableArray<PolicyIssue>.Builder diagnostics)
{
    var pos = index;
    var row = line;
    var col = column;

    while (pos < source.Length)
    {
        switch (source[pos])
        {
            case '*' when Match(source, pos + 1, '/'):
                return (pos + 2, row, col + 2);

            case '\r':
                // Treat CR LF as a single line break.
                pos += Match(source, pos + 1, '\n') ? 2 : 1;
                row++;
                col = 1;
                break;

            case '\n':
                pos++;
                row++;
                col = 1;
                break;

            default:
                pos++;
                col++;
                break;
        }
    }

    diagnostics.Add(PolicyIssue.Error(DiagnosticCodes.UnexpectedCharacter, "Unterminated comment block.", $"@{line}:{column}"));
    return (source.Length, row, col);
}
/// <summary>
/// Builds a fixed-text token of <paramref name="advance"/> characters on a single line and moves
/// the cursor past it.
/// </summary>
/// <param name="kind">Kind of the token to create.</param>
/// <param name="text">The token's text.</param>
/// <param name="start">Location of the token's first character.</param>
/// <param name="index">Current position; advanced by <paramref name="advance"/>.</param>
/// <param name="column">Current column; advanced by <paramref name="advance"/>.</param>
/// <param name="advance">Number of characters the token occupies.</param>
/// <returns>The new token, spanning from <paramref name="start"/> to the advanced position.</returns>
private static DslToken CreateToken(
    TokenKind kind,
    string text,
    SourceLocation start,
    ref int index,
    ref int column,
    int advance = 1)
{
    index += advance;
    column += advance;

    // The end location is derived from the start location, not from the running column.
    var span = new SourceSpan(start, new SourceLocation(index, start.Line, start.Column + advance));
    return new DslToken(kind, text, span);
}
/// <summary>
/// Records an error issue for a character that cannot start any token.
/// </summary>
/// <param name="diagnostics">Receives the issue.</param>
/// <param name="ch">The offending character, quoted in the message.</param>
/// <param name="location">Where the character was found; formatted as <c>@line:column</c>.</param>
private static void ReportUnexpectedCharacter(
    ImmutableArray<PolicyIssue>.Builder diagnostics,
    char ch,
    SourceLocation location) =>
    diagnostics.Add(
        PolicyIssue.Error(
            DiagnosticCodes.UnexpectedCharacter,
            $"Unexpected character '{ch}'.",
            $"@{location.Line}:{location.Column}"));
/// <summary>
/// Tells whether the character at <paramref name="index"/> exists and equals <paramref name="expected"/>.
/// </summary>
/// <param name="source">The text to inspect.</param>
/// <param name="index">Position to test; positions at or past the end never match.</param>
/// <param name="expected">The character to compare against.</param>
private static bool Match(string source, int index, char expected)
{
    if (index >= source.Length)
    {
        return false;
    }

    return source[index] == expected;
}
}
/// <summary>
/// Result of tokenizing a policy DSL source.
/// </summary>
/// <param name="Tokens">
/// The tokens in source order. <see cref="DslTokenizer.Tokenize"/> always appends a final
/// end-of-file token.
/// </param>
/// <param name="Diagnostics">
/// The issues reported while scanning, such as unexpected characters, unterminated strings,
/// invalid escape sequences, and invalid numeric literals.
/// </param>
public readonly record struct TokenizerResult(
ImmutableArray<DslToken> Tokens,
ImmutableArray<PolicyIssue> Diagnostics);