828 lines
22 KiB
C#
828 lines
22 KiB
C#
using System;
|
|
|
|
namespace Flee.Parsing
|
|
{
|
|
/**
|
|
* A non-deterministic finite state automaton (NFA) for matching
|
|
* tokens. It supports both fixed strings and simple regular
|
|
* expressions, but should perform similar to a DFA due to highly
|
|
* optimized data structures and tuning. The memory footprint during
|
|
* matching should be near zero, since no heap memory is allocated
|
|
* unless the pre-allocated queues need to be enlarged. The NFA also
|
|
* does not use recursion, but iterates in a loop instead.
|
|
*/
|
|
internal class TokenNFA
|
|
{
|
|
private readonly NFAState[] _initialChar = new NFAState[128];
|
|
private readonly NFAState _initial = new NFAState();
|
|
private readonly NFAStateQueue _queue = new NFAStateQueue();
|
|
|
|
public void AddTextMatch(string str, bool ignoreCase, TokenPattern value)
|
|
{
|
|
NFAState state;
|
|
char ch = str[0];
|
|
|
|
if (ch < 128 && !ignoreCase)
|
|
{
|
|
state = _initialChar[ch];
|
|
if (state == null)
|
|
{
|
|
state = _initialChar[ch] = new NFAState();
|
|
}
|
|
}
|
|
else
|
|
{
|
|
state = _initial.AddOut(ch, ignoreCase, null);
|
|
}
|
|
for (int i = 1; i < str.Length; i++)
|
|
{
|
|
state = state.AddOut(str[i], ignoreCase, null);
|
|
}
|
|
state.Value = value;
|
|
}
|
|
|
|
public void AddRegExpMatch(string pattern,
|
|
bool ignoreCase,
|
|
TokenPattern value)
|
|
{
|
|
TokenRegExpParser parser = new TokenRegExpParser(pattern, ignoreCase);
|
|
string debug = "DFA regexp; " + parser.GetDebugInfo();
|
|
|
|
var isAscii = parser.Start.IsAsciiOutgoing();
|
|
for (int i = 0; isAscii && i < 128; i++)
|
|
{
|
|
bool match = false;
|
|
for (int j = 0; j < parser.Start.Outgoing.Length; j++)
|
|
{
|
|
if (parser.Start.Outgoing[j].Match((char)i))
|
|
{
|
|
if (match)
|
|
{
|
|
isAscii = false;
|
|
break;
|
|
}
|
|
match = true;
|
|
}
|
|
}
|
|
if (match && _initialChar[i] != null)
|
|
{
|
|
isAscii = false;
|
|
}
|
|
}
|
|
if (parser.Start.Incoming.Length > 0)
|
|
{
|
|
_initial.AddOut(new NFAEpsilonTransition(parser.Start));
|
|
debug += ", uses initial epsilon";
|
|
}
|
|
else if (isAscii && !ignoreCase)
|
|
{
|
|
for (int i = 0; isAscii && i < 128; i++)
|
|
{
|
|
for (int j = 0; j < parser.Start.Outgoing.Length; j++)
|
|
{
|
|
if (parser.Start.Outgoing[j].Match((char)i))
|
|
{
|
|
_initialChar[i] = parser.Start.Outgoing[j].State;
|
|
}
|
|
}
|
|
}
|
|
debug += ", uses ASCII lookup";
|
|
}
|
|
else
|
|
{
|
|
parser.Start.MergeInto(_initial);
|
|
debug += ", uses initial state";
|
|
}
|
|
parser.End.Value = value;
|
|
value.DebugInfo = debug;
|
|
}
|
|
|
|
public int Match(ReaderBuffer buffer, TokenMatch match)
|
|
{
|
|
int length = 0;
|
|
int pos = 1;
|
|
NFAState state;
|
|
|
|
// The first step of the match loop has been unrolled and
|
|
// optimized for performance below.
|
|
this._queue.Clear();
|
|
var peekChar = buffer.Peek(0);
|
|
if (0 <= peekChar && peekChar < 128)
|
|
{
|
|
state = this._initialChar[peekChar];
|
|
if (state != null)
|
|
{
|
|
this._queue.AddLast(state);
|
|
}
|
|
}
|
|
if (peekChar >= 0)
|
|
{
|
|
this._initial.MatchTransitions((char)peekChar, this._queue, true);
|
|
}
|
|
this._queue.MarkEnd();
|
|
peekChar = buffer.Peek(1);
|
|
|
|
// The remaining match loop processes all subsequent states
|
|
while (!this._queue.Empty)
|
|
{
|
|
if (this._queue.Marked)
|
|
{
|
|
pos++;
|
|
peekChar = buffer.Peek(pos);
|
|
this._queue.MarkEnd();
|
|
}
|
|
state = this._queue.RemoveFirst();
|
|
if (state.Value != null)
|
|
{
|
|
match.Update(pos, state.Value);
|
|
}
|
|
if (peekChar >= 0)
|
|
{
|
|
state.MatchTransitions((char)peekChar, this._queue, false);
|
|
}
|
|
}
|
|
return length;
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* An NFA state. The NFA consists of a series of states, each
|
|
* having zero or more transitions to other states.
|
|
*/
|
|
internal class NFAState
|
|
{
|
|
internal TokenPattern Value = null;
|
|
internal NFATransition[] Incoming = new NFATransition[0];
|
|
internal NFATransition[] Outgoing = new NFATransition[0];
|
|
internal bool EpsilonOut = false;
|
|
|
|
public bool HasTransitions()
|
|
{
|
|
return Incoming.Length > 0 || Outgoing.Length > 0;
|
|
}
|
|
public bool IsAsciiOutgoing()
|
|
{
|
|
for (int i = 0; i < Outgoing.Length; i++)
|
|
{
|
|
if (!Outgoing[i].IsAscii())
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
public void AddIn(NFATransition trans)
|
|
{
|
|
Array.Resize(ref Incoming, Incoming.Length + 1);
|
|
Incoming[Incoming.Length - 1] = trans;
|
|
}
|
|
|
|
public NFAState AddOut(char ch, bool ignoreCase, NFAState state)
|
|
{
|
|
if (ignoreCase)
|
|
{
|
|
if (state == null)
|
|
{
|
|
state = new NFAState();
|
|
}
|
|
AddOut(new NFACharTransition(Char.ToLower(ch), state));
|
|
AddOut(new NFACharTransition(Char.ToUpper(ch), state));
|
|
return state;
|
|
}
|
|
else
|
|
{
|
|
if (state == null)
|
|
{
|
|
state = FindUniqueCharTransition(ch);
|
|
if (state != null)
|
|
{
|
|
return state;
|
|
}
|
|
state = new NFAState();
|
|
}
|
|
return AddOut(new NFACharTransition(ch, state));
|
|
}
|
|
}
|
|
|
|
public NFAState AddOut(NFATransition trans)
|
|
{
|
|
Array.Resize(ref Outgoing, Outgoing.Length + 1);
|
|
Outgoing[Outgoing.Length - 1] = trans;
|
|
if (trans is NFAEpsilonTransition)
|
|
{
|
|
EpsilonOut = true;
|
|
}
|
|
return trans.State;
|
|
}
|
|
|
|
public void MergeInto(NFAState state)
|
|
{
|
|
for (int i = 0; i < Incoming.Length; i++)
|
|
{
|
|
state.AddIn(Incoming[i]);
|
|
Incoming[i].State = state;
|
|
}
|
|
Incoming = null;
|
|
for (int i = 0; i < Outgoing.Length; i++)
|
|
{
|
|
state.AddOut(Outgoing[i]);
|
|
}
|
|
Outgoing = null;
|
|
}
|
|
|
|
private NFAState FindUniqueCharTransition(char ch)
|
|
{
|
|
NFATransition res = null;
|
|
NFATransition trans;
|
|
|
|
for (int i = 0; i < Outgoing.Length; i++)
|
|
{
|
|
trans = Outgoing[i];
|
|
if (trans.Match(ch) && trans is NFACharTransition)
|
|
{
|
|
if (res != null)
|
|
{
|
|
return null;
|
|
}
|
|
res = trans;
|
|
}
|
|
}
|
|
for (int i = 0; res != null && i < Outgoing.Length; i++)
|
|
{
|
|
trans = Outgoing[i];
|
|
if (trans != res && trans.State == res.State)
|
|
{
|
|
return null;
|
|
}
|
|
}
|
|
return res?.State;
|
|
}
|
|
|
|
public void MatchTransitions(char ch, NFAStateQueue queue, bool initial)
|
|
{
|
|
for (int i = 0; i < Outgoing.Length; i++)
|
|
{
|
|
var trans = Outgoing[i];
|
|
var target = trans.State;
|
|
if (initial && trans is NFAEpsilonTransition)
|
|
{
|
|
target.MatchTransitions(ch, queue, true);
|
|
}
|
|
else if (trans.Match(ch))
|
|
{
|
|
queue.AddLast(target);
|
|
if (target.EpsilonOut)
|
|
{
|
|
target.MatchEmpty(queue);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
public void MatchEmpty(NFAStateQueue queue)
|
|
{
|
|
for (int i = 0; i < Outgoing.Length; i++)
|
|
{
|
|
var trans = Outgoing[i];
|
|
if (trans is NFAEpsilonTransition)
|
|
{
|
|
var target = trans.State;
|
|
queue.AddLast(target);
|
|
if (target.EpsilonOut)
|
|
{
|
|
target.MatchEmpty(queue);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* An NFA state transition. A transition checks a single
|
|
* character of input an determines if it is a match. If a match
|
|
* is encountered, the NFA should move forward to the transition
|
|
* state.
|
|
*/
|
|
internal abstract class NFATransition
|
|
{
|
|
|
|
internal NFAState State;
|
|
|
|
protected NFATransition(NFAState state)
|
|
{
|
|
this.State = state;
|
|
this.State.AddIn(this);
|
|
}
|
|
|
|
public abstract bool IsAscii();
|
|
|
|
public abstract bool Match(char ch);
|
|
|
|
public abstract NFATransition Copy(NFAState state);
|
|
}
|
|
|
|
|
|
/**
|
|
* The special epsilon transition. This transition matches the
|
|
* empty input, i.e. it is an automatic transition that doesn't
|
|
* read any input. As such, it returns false in the match method
|
|
* and is handled specially everywhere.
|
|
*/
|
|
internal class NFAEpsilonTransition : NFATransition
|
|
{
|
|
public NFAEpsilonTransition(NFAState state) : base(state)
|
|
{
|
|
}
|
|
|
|
public override bool IsAscii()
|
|
{
|
|
return false;
|
|
}
|
|
|
|
public override bool Match(char ch)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
public override NFATransition Copy(NFAState state)
|
|
{
|
|
return new NFAEpsilonTransition(state);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* A single character match transition.
|
|
*/
|
|
internal class NFACharTransition : NFATransition
|
|
{
|
|
private readonly char _match;
|
|
|
|
public NFACharTransition(char match, NFAState state) : base(state)
|
|
{
|
|
_match = match;
|
|
}
|
|
|
|
public override bool IsAscii()
|
|
{
|
|
return 0 <= _match && _match < 128;
|
|
}
|
|
|
|
public override bool Match(char ch)
|
|
{
|
|
return this._match == ch;
|
|
}
|
|
|
|
public override NFATransition Copy(NFAState state)
|
|
{
|
|
return new NFACharTransition(_match, state);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* A character range match transition. Used for user-defined
|
|
* character sets in regular expressions.
|
|
*/
|
|
internal class NFACharRangeTransition : NFATransition
|
|
{
|
|
|
|
protected bool Inverse;
|
|
protected bool IgnoreCase;
|
|
|
|
private object[] _contents = new object[0];
|
|
|
|
public NFACharRangeTransition(bool inverse,
|
|
bool ignoreCase,
|
|
NFAState state) : base(state)
|
|
{
|
|
this.Inverse = inverse;
|
|
this.IgnoreCase = ignoreCase;
|
|
}
|
|
|
|
public override bool IsAscii()
|
|
{
|
|
if (Inverse)
|
|
{
|
|
return false;
|
|
}
|
|
for (int i = 0; i < _contents.Length; i++)
|
|
{
|
|
var obj = _contents[i];
|
|
if (obj is char)
|
|
{
|
|
var c = (char)obj;
|
|
if (c < 0 || 128 <= c)
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
else if (obj is Range)
|
|
{
|
|
if (!((Range)obj).IsAscii())
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
public void AddCharacter(char c)
|
|
{
|
|
if (IgnoreCase)
|
|
{
|
|
c = Char.ToLower(c);
|
|
}
|
|
AddContent(c);
|
|
}
|
|
|
|
public void AddRange(char min, char max)
|
|
{
|
|
if (IgnoreCase)
|
|
{
|
|
min = Char.ToLower(min);
|
|
max = Char.ToLower(max);
|
|
}
|
|
AddContent(new Range(min, max));
|
|
}
|
|
|
|
private void AddContent(Object obj)
|
|
{
|
|
Array.Resize(ref _contents, _contents.Length + 1);
|
|
_contents[_contents.Length - 1] = obj;
|
|
}
|
|
|
|
public override bool Match(char ch)
|
|
{
|
|
object obj;
|
|
char c;
|
|
Range r;
|
|
|
|
if (IgnoreCase)
|
|
{
|
|
ch = Char.ToLower(ch);
|
|
}
|
|
for (int i = 0; i < _contents.Length; i++)
|
|
{
|
|
obj = _contents[i];
|
|
if (obj is char)
|
|
{
|
|
c = (char)obj;
|
|
if (c == ch)
|
|
{
|
|
return !Inverse;
|
|
}
|
|
}
|
|
else if (obj is Range)
|
|
{
|
|
r = (Range)obj;
|
|
if (r.Inside(ch))
|
|
{
|
|
return !Inverse;
|
|
}
|
|
}
|
|
}
|
|
return Inverse;
|
|
}
|
|
|
|
public override NFATransition Copy(NFAState state)
|
|
{
|
|
var copy = new NFACharRangeTransition(Inverse, IgnoreCase, state) { _contents = _contents };
|
|
return copy;
|
|
}
|
|
|
|
private class Range
|
|
{
|
|
private readonly char _min;
|
|
private readonly char _max;
|
|
|
|
public Range(char min, char max)
|
|
{
|
|
this._min = min;
|
|
this._max = max;
|
|
}
|
|
|
|
public bool IsAscii()
|
|
{
|
|
return 0 <= _min && _min < 128 &&
|
|
0 <= _max && _max < 128;
|
|
}
|
|
|
|
public bool Inside(char c)
|
|
{
|
|
return _min <= c && c <= _max;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* The dot ('.') character set transition. This transition
|
|
* matches a single character that is not equal to a newline
|
|
* character.
|
|
*/
|
|
internal class NFADotTransition : NFATransition
|
|
{
|
|
public NFADotTransition(NFAState state) : base(state)
|
|
{
|
|
}
|
|
|
|
public override bool IsAscii()
|
|
{
|
|
return false;
|
|
}
|
|
|
|
public override bool Match(char ch)
|
|
{
|
|
switch (ch)
|
|
{
|
|
case '\n':
|
|
case '\r':
|
|
case '\u0085':
|
|
case '\u2028':
|
|
case '\u2029':
|
|
return false;
|
|
default:
|
|
return true;
|
|
}
|
|
}
|
|
|
|
public override NFATransition Copy(NFAState state)
|
|
{
|
|
return new NFADotTransition(state);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* The digit character set transition. This transition matches a
|
|
* single numeric character.
|
|
*/
|
|
internal class NFADigitTransition : NFATransition
|
|
{
|
|
public NFADigitTransition(NFAState state) : base(state)
|
|
{
|
|
}
|
|
|
|
public override bool IsAscii()
|
|
{
|
|
return true;
|
|
}
|
|
|
|
public override bool Match(char ch)
|
|
{
|
|
return '0' <= ch && ch <= '9';
|
|
}
|
|
|
|
public override NFATransition Copy(NFAState state)
|
|
{
|
|
return new NFADigitTransition(state);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* The non-digit character set transition. This transition
|
|
* matches a single non-numeric character.
|
|
*/
|
|
internal class NFANonDigitTransition : NFATransition
|
|
{
|
|
public NFANonDigitTransition(NFAState state) : base(state)
|
|
{
|
|
}
|
|
|
|
public override bool IsAscii()
|
|
{
|
|
return false;
|
|
}
|
|
|
|
public override bool Match(char ch)
|
|
{
|
|
return ch < '0' || '9' < ch;
|
|
}
|
|
|
|
public override NFATransition Copy(NFAState state)
|
|
{
|
|
return new NFANonDigitTransition(state);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* The whitespace character set transition. This transition
|
|
* matches a single whitespace character.
|
|
*/
|
|
internal class NFAWhitespaceTransition : NFATransition
|
|
{
|
|
public NFAWhitespaceTransition(NFAState state) : base(state)
|
|
{
|
|
}
|
|
|
|
public override bool IsAscii()
|
|
{
|
|
return true;
|
|
}
|
|
|
|
public override bool Match(char ch)
|
|
{
|
|
switch (ch)
|
|
{
|
|
case ' ':
|
|
case '\t':
|
|
case '\n':
|
|
case '\f':
|
|
case '\r':
|
|
case (char)11:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
public override NFATransition Copy(NFAState state)
|
|
{
|
|
return new NFAWhitespaceTransition(state);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* The non-whitespace character set transition. This transition
|
|
* matches a single non-whitespace character.
|
|
*/
|
|
internal class NFANonWhitespaceTransition : NFATransition
|
|
{
|
|
|
|
public NFANonWhitespaceTransition(NFAState state) : base(state)
|
|
{
|
|
}
|
|
|
|
public override bool IsAscii()
|
|
{
|
|
return false;
|
|
}
|
|
|
|
public override bool Match(char ch)
|
|
{
|
|
switch (ch)
|
|
{
|
|
case ' ':
|
|
case '\t':
|
|
case '\n':
|
|
case '\f':
|
|
case '\r':
|
|
case (char)11:
|
|
return false;
|
|
default:
|
|
return true;
|
|
}
|
|
}
|
|
|
|
public override NFATransition Copy(NFAState state)
|
|
{
|
|
return new NFANonWhitespaceTransition(state);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* The word character set transition. This transition matches a
|
|
* single word character.
|
|
*/
|
|
internal class NFAWordTransition : NFATransition
|
|
{
|
|
|
|
public NFAWordTransition(NFAState state) : base(state)
|
|
{
|
|
}
|
|
|
|
public override bool IsAscii()
|
|
{
|
|
return true;
|
|
}
|
|
|
|
|
|
public override bool Match(char ch)
|
|
{
|
|
return ('a' <= ch && ch <= 'z')
|
|
|| ('A' <= ch && ch <= 'Z')
|
|
|| ('0' <= ch && ch <= '9')
|
|
|| ch == '_';
|
|
}
|
|
|
|
public override NFATransition Copy(NFAState state)
|
|
{
|
|
return new NFAWordTransition(state);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* The non-word character set transition. This transition matches
|
|
* a single non-word character.
|
|
*/
|
|
internal class NFANonWordTransition : NFATransition
|
|
{
|
|
public NFANonWordTransition(NFAState state) : base(state)
|
|
{
|
|
}
|
|
|
|
public override bool IsAscii()
|
|
{
|
|
return false;
|
|
}
|
|
|
|
public override bool Match(char ch)
|
|
{
|
|
bool word = ('a' <= ch && ch <= 'z')
|
|
|| ('A' <= ch && ch <= 'Z')
|
|
|| ('0' <= ch && ch <= '9')
|
|
|| ch == '_';
|
|
return !word;
|
|
}
|
|
|
|
public override NFATransition Copy(NFAState state)
|
|
{
|
|
return new NFANonWordTransition(state);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* An NFA state queue. This queue is used during processing to
|
|
* keep track of the current and subsequent NFA states. The
|
|
* current state is read from the beginning of the queue, and new
|
|
* states are added at the end. A marker index is used to
|
|
* separate the current from the subsequent states.<p>
|
|
*
|
|
* The queue implementation is optimized for quick removal at the
|
|
* beginning and addition at the end. It will attempt to use a
|
|
* fixed-size array to store the whole queue, and moves the data
|
|
* in this array only when absolutely needed. The array is also
|
|
* enlarged automatically if too many states are being processed
|
|
* at a single time.
|
|
*/
|
|
internal class NFAStateQueue
|
|
{
|
|
|
|
private NFAState[] _queue = new NFAState[2048];
|
|
|
|
private int _first = 0;
|
|
|
|
private int _last = 0;
|
|
|
|
private int _mark = 0;
|
|
|
|
public bool Empty => (_last <= _first);
|
|
|
|
public bool Marked => _first == _mark;
|
|
|
|
public void Clear()
|
|
{
|
|
_first = 0;
|
|
_last = 0;
|
|
_mark = 0;
|
|
}
|
|
|
|
public void MarkEnd()
|
|
{
|
|
_mark = _last;
|
|
}
|
|
|
|
public NFAState RemoveFirst()
|
|
{
|
|
if (_first < _last)
|
|
{
|
|
_first++;
|
|
return _queue[_first - 1];
|
|
}
|
|
else
|
|
{
|
|
return null;
|
|
}
|
|
}
|
|
|
|
public void AddLast(NFAState state)
|
|
{
|
|
if (_last >= _queue.Length)
|
|
{
|
|
if (_first <= 0)
|
|
{
|
|
Array.Resize(ref _queue, _queue.Length * 2);
|
|
}
|
|
else
|
|
{
|
|
Array.Copy(_queue, _first, _queue, 0, _last - _first);
|
|
_last -= _first;
|
|
_mark -= _first;
|
|
_first = 0;
|
|
}
|
|
}
|
|
_queue[_last++] = state;
|
|
}
|
|
}
|
|
}
|