547 lines
17 KiB
C#
547 lines
17 KiB
C#
using System;
|
|
using System.Collections;
|
|
using System.Globalization;
|
|
using System.Text;
|
|
|
|
namespace Flee.Parsing
|
|
{
|
|
/**
 * A regular expression parser. The parser creates an NFA for the
 * regular expression, with a single start state and a single
 * acceptance state.
 */
|
|
internal class TokenRegExpParser
|
|
{
|
|
/** The regular expression pattern text handed to the constructor. */
private readonly string _pattern;

/** Case-insensitivity flag passed on to the character transitions. */
private readonly bool _ignoreCase;

/** Index into the pattern of the next character to read. */
private int _pos;

/** The NFA start state; parsing begins from this state. */
internal NFAState Start = new NFAState();

/** The NFA end state, assigned by the constructor. */
internal NFAState End;

/** Number of states counted by UpdateStats. */
private int _stateCount;

/** Number of transitions counted by UpdateStats. */
private int _transitionCount;

/** Number of epsilon transitions counted by UpdateStats. */
private int _epsilonCount;
|
|
|
|
/**
 * Creates a parser for the given pattern with case-sensitive
 * matching. Delegates to the two-argument constructor, so the
 * pattern is parsed immediately.
 */
public TokenRegExpParser(string pattern) : this(pattern, false)
{
}

/**
 * Creates a parser for the given pattern and parses it right away,
 * building the NFA between Start and End.
 *
 * Throws ArgumentNullException if the pattern is null, and
 * RegExpException (UNEXPECTED_CHARACTER) if characters remain in
 * the pattern after the top-level expression has been parsed.
 */
public TokenRegExpParser(string pattern, bool ignoreCase)
{
    // Fail fast with a descriptive exception instead of a
    // NullReferenceException raised later while peeking characters.
    if (pattern == null)
    {
        throw new ArgumentNullException("pattern");
    }

    _pattern = pattern;
    _ignoreCase = ignoreCase;
    _pos = 0;
    End = ParseExpr(Start);

    // Anything left over (e.g. an unmatched ')') is a syntax error.
    if (_pos < pattern.Length)
    {
        throw new RegExpException(
            RegExpException.ErrorType.UNEXPECTED_CHARACTER,
            _pos,
            pattern);
    }
}
|
|
|
|
/**
 * Returns a one-line summary of the NFA: the number of states,
 * transitions and epsilon transitions. The counters are filled in
 * by walking the graph from Start whenever the state counter is
 * still zero.
 */
public string GetDebugInfo()
{
    bool statsMissing = (_stateCount == 0);

    if (statsMissing)
    {
        UpdateStats(Start, new Hashtable());
    }

    StringBuilder summary = new StringBuilder();
    summary.Append(_stateCount).Append(" states, ");
    summary.Append(_transitionCount).Append(" transitions, ");
    summary.Append(_epsilonCount).Append(" epsilons");
    return summary.ToString();
}
|
|
|
|
/**
 * Recursively walks the NFA from the given state and increments
 * the state, transition and epsilon counters. The visited table
 * ensures each state is only counted once.
 */
private void UpdateStats(NFAState state, Hashtable visited)
{
    // Already counted: nothing more to do for this state.
    if (visited.ContainsKey(state))
    {
        return;
    }

    visited.Add(state, state);
    _stateCount++;

    for (int index = 0; index < state.Outgoing.Length; index++)
    {
        var transition = state.Outgoing[index];

        _transitionCount++;
        if (transition is NFAEpsilonTransition)
        {
            _epsilonCount++;
        }
        UpdateStats(transition.State, visited);
    }
}
|
|
|
|
/**
 * Parses an expression: one or more terms separated by '|'. Each
 * alternative is parsed from a fresh sub-start state and then
 * attached to the given start state and to a newly created common
 * end state, either via MergeInto or via an epsilon transition.
 * Returns the common end state.
 */
private NFAState ParseExpr(NFAState start)
{
    NFAState end = new NFAState();

    while (true)
    {
        // Consume the separator in front of the 2nd, 3rd, ... term.
        if (PeekChar(0) == '|')
        {
            ReadChar('|');
        }

        NFAState termStart = new NFAState();
        NFAState termEnd = ParseTerm(termStart);

        // Attach the beginning of the term to the shared start.
        if (termStart.Incoming.Length != 0)
        {
            start.AddOut(new NFAEpsilonTransition(termStart));
        }
        else
        {
            termStart.MergeInto(start);
        }

        // Attach the end of the term to the shared end.
        bool mergeEnd = termEnd.Outgoing.Length == 0 ||
                        (!end.HasTransitions() && PeekChar(0) != '|');
        if (mergeEnd)
        {
            termEnd.MergeInto(end);
        }
        else
        {
            termEnd.AddOut(new NFAEpsilonTransition(end));
        }

        if (PeekChar(0) != '|')
        {
            return end;
        }
    }
}
|
|
|
|
/**
 * Parses a term: a sequence of one or more facts, each one chained
 * onto the end state of the previous. Parsing stops at the end of
 * the pattern or at any of the characters ) ] { } ? + | and the
 * end state of the last fact is returned.
 */
private NFAState ParseTerm(NFAState start)
{
    // Characters that terminate a term (end of pattern is handled
    // separately since it is reported as -1).
    const string terminators = ")]{}?+|";

    NFAState end = ParseFact(start);

    while (true)
    {
        int next = PeekChar(0);

        if (next == -1 || terminators.IndexOf((char)next) >= 0)
        {
            return end;
        }
        end = ParseFact(end);
    }
}
|
|
|
|
/**
 * Parses a fact: an atom optionally followed by a modifier
 * (?, *, + or {...}). The atom is built from a temporary
 * placeholder state that is afterwards either linked from the
 * given start state with an epsilon transition or merged into it.
 * Returns the end state of the fact.
 */
private NFAState ParseFact(NFAState start)
{
    NFAState placeholder = new NFAState();
    NFAState end = ParseAtom(placeholder);

    int next = PeekChar(0);
    if (next == '?' || next == '*' || next == '+' || next == '{')
    {
        end = ParseAtomModifier(placeholder, end);
    }

    // An epsilon link is used only when the placeholder has incoming
    // transitions and the start state already has outgoing ones.
    bool linkWithEpsilon = placeholder.Incoming.Length > 0 &&
                           start.Outgoing.Length > 0;
    if (linkWithEpsilon)
    {
        start.AddOut(new NFAEpsilonTransition(placeholder));
        return end;
    }

    placeholder.MergeInto(start);
    if (end == placeholder)
    {
        // The placeholder was also the end; after the merge the
        // start state takes its place.
        return start;
    }
    return end;
}
|
|
|
|
/**
 * Parses an atom: '.', a parenthesized expression, a bracketed
 * character set, or a single (possibly escaped) character.
 * Returns the end state of the atom. Throws RegExpException
 * (UNEXPECTED_CHARACTER) at the end of the pattern or when the
 * next character is one of ) ] { } ? * + |.
 */
private NFAState ParseAtom(NFAState start)
{
    const string notAllowed = ")]{}?*+|";
    int next = PeekChar(0);
    NFAState end;

    if (next == '.')
    {
        ReadChar('.');
        return start.AddOut(new NFADotTransition(new NFAState()));
    }

    if (next == '(')
    {
        ReadChar('(');
        end = ParseExpr(start);
        ReadChar(')');
        return end;
    }

    if (next == '[')
    {
        ReadChar('[');
        end = ParseCharSet(start);
        ReadChar(']');
        return end;
    }

    if (next == -1 || notAllowed.IndexOf((char)next) >= 0)
    {
        throw new RegExpException(
            RegExpException.ErrorType.UNEXPECTED_CHARACTER,
            _pos,
            _pattern);
    }

    return ParseChar(start);
}
|
|
|
|
/**
 * Parses an atom modifier (?, *, + or {min[,[max]]}) and rewires
 * the atom spanning start..end accordingly. Only the repeat
 * ranges 0..1, 0..unbounded and 1..unbounded are supported; a max
 * value of -1 stands for "unbounded".
 *
 * Throws RegExpException with INVALID_REPEAT_COUNT for any other
 * range, UNSUPPORTED_SPECIAL_CHARACTER when the modifier is
 * followed by '?' or '+', and UNEXPECTED_CHARACTER when the
 * character read is not a modifier.
 */
private NFAState ParseAtomModifier(NFAState start, NFAState end)
{
    int firstPos = _pos;
    int min;
    int max;

    // Read min and max
    char modifier = ReadChar();
    if (modifier == '?')
    {
        min = 0;
        max = 1;
    }
    else if (modifier == '*')
    {
        min = 0;
        max = -1;
    }
    else if (modifier == '+')
    {
        min = 1;
        max = -1;
    }
    else if (modifier == '{')
    {
        min = ReadNumber();
        max = min;
        if (PeekChar(0) == ',')
        {
            ReadChar(',');
            // "{n,}" leaves the upper bound open.
            max = (PeekChar(0) == '}') ? -1 : ReadNumber();
        }
        ReadChar('}');

        bool invalidRange = max == 0 || (max > 0 && min > max);
        if (invalidRange)
        {
            throw new RegExpException(
                RegExpException.ErrorType.INVALID_REPEAT_COUNT,
                firstPos,
                _pattern);
        }
    }
    else
    {
        throw new RegExpException(
            RegExpException.ErrorType.UNEXPECTED_CHARACTER,
            _pos - 1,
            _pattern);
    }

    // Read possessive or reluctant modifiers
    int suffix = PeekChar(0);
    if (suffix == '?' || suffix == '+')
    {
        throw new RegExpException(
            RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
            _pos,
            _pattern);
    }

    // Handle supported repeaters
    if (min == 0 && max == 1)
    {
        // Optional: allow skipping the atom altogether.
        return start.AddOut(new NFAEpsilonTransition(end));
    }

    if (min == 0 && max == -1)
    {
        // Zero or more: loop the end back onto the start.
        if (end.Outgoing.Length != 0)
        {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        else
        {
            end.MergeInto(start);
        }
        return start;
    }

    if (min == 1 && max == -1)
    {
        // One or more: when the atom is a single transition from
        // start to end, copy it as a self-loop on end; otherwise
        // loop back with an epsilon transition.
        bool singleTransition = start.Outgoing.Length == 1 &&
                                end.Outgoing.Length == 0 &&
                                end.Incoming.Length == 1 &&
                                start.Outgoing[0] == end.Incoming[0];
        if (singleTransition)
        {
            end.AddOut(start.Outgoing[0].Copy(end));
        }
        else
        {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        return end;
    }

    throw new RegExpException(
        RegExpException.ErrorType.INVALID_REPEAT_COUNT,
        firstPos,
        _pattern);
}
|
|
|
|
/**
 * Parses the contents of a character set, i.e. the part between
 * '[' and ']' (the brackets themselves are read by the caller).
 * A leading '^' creates an inverse set. A single range transition
 * from start to a new end state is created and filled with the
 * characters, escapes and "a-z" style ranges found. Returns the
 * end state without consuming the closing ']'.
 */
private NFAState ParseCharSet(NFAState start)
{
    NFAState end = new NFAState();

    bool inverse = PeekChar(0) == '^';
    if (inverse)
    {
        ReadChar('^');
    }

    NFACharRangeTransition range =
        new NFACharRangeTransition(inverse, _ignoreCase, end);
    start.AddOut(range);

    while (PeekChar(0) > 0)
    {
        char min = (char)PeekChar(0);

        if (min == ']')
        {
            return end;
        }

        if (min == '\\')
        {
            range.AddCharacter(ReadEscapeChar());
            continue;
        }

        ReadChar(min);

        // A '-' only forms a range when something other than the
        // closing ']' follows it.
        bool isRange = PeekChar(0) == '-' &&
                       PeekChar(1) > 0 &&
                       PeekChar(1) != ']';
        if (!isRange)
        {
            range.AddCharacter(min);
            continue;
        }

        ReadChar('-');
        char max = ReadChar();
        range.AddRange(min, max);
    }

    return end;
}
|
|
|
|
/**
 * Parses a single character atom. A backslash is handed over to
 * ParseEscapeChar, '^' and '$' are rejected with a RegExpException
 * (UNSUPPORTED_SPECIAL_CHARACTER), and any other character becomes
 * a transition from start to a new state. Returns the end state.
 */
private NFAState ParseChar(NFAState start)
{
    int next = PeekChar(0);

    if (next == '\\')
    {
        return ParseEscapeChar(start);
    }

    if (next == '^' || next == '$')
    {
        throw new RegExpException(
            RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
            _pos,
            _pattern);
    }

    char literal = ReadChar();
    NFAState target = new NFAState();
    return start.AddOut(literal, _ignoreCase, target);
}
|
|
|
|
/**
 * Parses an escape sequence atom. The character classes \d, \D,
 * \s, \S, \w and \W each get their own transition type; every
 * other escape is resolved to a single character by
 * ReadEscapeChar. Returns the end state of the new transition.
 */
private NFAState ParseEscapeChar(NFAState start)
{
    NFAState end = new NFAState();

    if (PeekChar(0) == '\\' && PeekChar(1) > 0)
    {
        char kind = (char)PeekChar(1);

        // Consume the backslash and the class letter up front for
        // the character class escapes handled below.
        if ("dDsSwW".IndexOf(kind) >= 0)
        {
            ReadChar();
            ReadChar();
        }

        switch (kind)
        {
            case 'd':
                return start.AddOut(new NFADigitTransition(end));
            case 'D':
                return start.AddOut(new NFANonDigitTransition(end));
            case 's':
                return start.AddOut(new NFAWhitespaceTransition(end));
            case 'S':
                return start.AddOut(new NFANonWhitespaceTransition(end));
            case 'w':
                return start.AddOut(new NFAWordTransition(end));
            case 'W':
                return start.AddOut(new NFANonWordTransition(end));
        }
    }

    return start.AddOut(ReadEscapeChar(), _ignoreCase, end);
}
|
|
|
|
/**
 * Reads an escape sequence from the pattern and returns the
 * character it denotes. Handles octal (\0 followed by a digit 0-3
 * and up to two more octal digits), hexadecimal (\xhh), unicode
 * (\uhhhh) and the control escapes \t \n \r \f \a \e. Any other
 * escaped ASCII letter is rejected; all remaining characters are
 * returned as-is.
 *
 * Throws RegExpException (UNSUPPORTED_ESCAPE_CHARACTER) for
 * malformed or unknown escapes.
 */
private char ReadEscapeChar()
{
    ReadChar('\\');
    char code = ReadChar();

    switch (code)
    {
        case '0':
            char first = ReadChar();
            if (first < '0' || first > '3')
            {
                throw new RegExpException(
                    RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
                    _pos - 3,
                    _pattern);
            }

            int octal = first - '0';
            // Up to two further octal digits may follow.
            for (int extra = 0; extra < 2; extra++)
            {
                int next = PeekChar(0);
                if (next < '0' || next > '7')
                {
                    break;
                }
                octal = octal * 8 + (ReadChar() - '0');
            }
            return (char)octal;

        case 'x':
            return ReadHexEscape(2);

        case 'u':
            return ReadHexEscape(4);

        case 't':
            return '\t';

        case 'n':
            return '\n';

        case 'r':
            return '\r';

        case 'f':
            return '\f';

        case 'a':
            return '\u0007';

        case 'e':
            return '\u001B';
    }

    bool isAsciiLetter = ('A' <= code && code <= 'Z') ||
                         ('a' <= code && code <= 'z');
    if (isAsciiLetter)
    {
        throw new RegExpException(
            RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
            _pos - 2,
            _pattern);
    }
    return code;
}

/**
 * Reads the given number of characters and interprets them as a
 * hexadecimal character code. Throws RegExpException
 * (UNSUPPORTED_ESCAPE_CHARACTER), positioned at the backslash that
 * started the escape, if they are not all hexadecimal digits.
 */
private char ReadHexEscape(int digitCount)
{
    StringBuilder digits = new StringBuilder(digitCount);
    for (int i = 0; i < digitCount; i++)
    {
        digits.Append(ReadChar());
    }

    string str = digits.ToString();
    int value;
    bool parsed = Int32.TryParse(
        str,
        NumberStyles.AllowHexSpecifier,
        NumberFormatInfo.CurrentInfo,
        out value);
    if (!parsed)
    {
        throw new RegExpException(
            RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
            _pos - str.Length - 2,
            _pattern);
    }
    return (char)value;
}
|
|
|
|
/**
 * Reads a run of decimal digits from the pattern and returns its
 * numeric value.
 *
 * Throws RegExpException with UNEXPECTED_CHARACTER if no digit is
 * present at the current position, and with INVALID_REPEAT_COUNT
 * (positioned at the first digit) if the digits do not fit in a
 * 32-bit integer. Previously such oversized values escaped as an
 * OverflowException from Int32.Parse.
 */
private int ReadNumber()
{
    int startPos = _pos;
    StringBuilder digits = new StringBuilder();
    int value;

    for (int c = PeekChar(0); '0' <= c && c <= '9'; c = PeekChar(0))
    {
        digits.Append(ReadChar());
    }

    if (digits.Length == 0)
    {
        throw new RegExpException(
            RegExpException.ErrorType.UNEXPECTED_CHARACTER,
            _pos,
            _pattern);
    }

    // The buffer holds only ASCII digits, so the only way parsing
    // can fail is a value beyond Int32.MaxValue. Parse culture-
    // invariantly since the pattern is not user-locale text.
    if (!Int32.TryParse(digits.ToString(),
                        NumberStyles.None,
                        CultureInfo.InvariantCulture,
                        out value))
    {
        throw new RegExpException(
            RegExpException.ErrorType.INVALID_REPEAT_COUNT,
            startPos,
            _pattern);
    }
    return value;
}
|
|
|
|
/**
 * Reads the next character from the pattern and advances the
 * position by one. Throws RegExpException (UNTERMINATED_PATTERN)
 * when the end of the pattern has already been reached.
 */
private char ReadChar()
{
    int next = PeekChar(0);

    if (next >= 0)
    {
        _pos++;
        return (char)next;
    }

    throw new RegExpException(
        RegExpException.ErrorType.UNTERMINATED_PATTERN,
        _pos,
        _pattern);
}
|
|
|
|
/**
 * Reads the next character and verifies that it is the expected
 * one. Returns the character on a match; otherwise throws
 * RegExpException (UNEXPECTED_CHARACTER) positioned at the
 * character that was read.
 */
private char ReadChar(char c)
{
    char actual = ReadChar();

    if (actual == c)
    {
        return c;
    }

    // The position was already advanced, hence the "- 1".
    throw new RegExpException(
        RegExpException.ErrorType.UNEXPECTED_CHARACTER,
        _pos - 1,
        _pattern);
}
|
|
|
|
/**
 * Returns the character located count positions ahead of the
 * current position without consuming anything, or -1 if that
 * position lies at or beyond the end of the pattern.
 */
private int PeekChar(int count)
{
    int index = _pos + count;

    return (index < _pattern.Length) ? _pattern[index] : -1;
}
|
|
}
|
|
}
|