Files
Flee/Parsing/TokenNFA.cs

828 lines
22 KiB
C#

using System;
namespace Flee.Parsing
{
/**
* A non-deterministic finite state automaton (NFA) for matching
* tokens. It supports both fixed strings and simple regular
* expressions, but should perform similar to a DFA due to highly
* optimized data structures and tuning. The memory footprint during
* matching should be near zero, since no heap memory is allocated
* unless the pre-allocated queues need to be enlarged. The NFA also
* does not use recursion, but iterates in a loop instead.
*/
internal class TokenNFA
{
private readonly NFAState[] _initialChar = new NFAState[128];
private readonly NFAState _initial = new NFAState();
private readonly NFAStateQueue _queue = new NFAStateQueue();
public void AddTextMatch(string str, bool ignoreCase, TokenPattern value)
{
NFAState state;
char ch = str[0];
if (ch < 128 && !ignoreCase)
{
state = _initialChar[ch];
if (state == null)
{
state = _initialChar[ch] = new NFAState();
}
}
else
{
state = _initial.AddOut(ch, ignoreCase, null);
}
for (int i = 1; i < str.Length; i++)
{
state = state.AddOut(str[i], ignoreCase, null);
}
state.Value = value;
}
public void AddRegExpMatch(string pattern,
bool ignoreCase,
TokenPattern value)
{
TokenRegExpParser parser = new TokenRegExpParser(pattern, ignoreCase);
string debug = "DFA regexp; " + parser.GetDebugInfo();
var isAscii = parser.Start.IsAsciiOutgoing();
for (int i = 0; isAscii && i < 128; i++)
{
bool match = false;
for (int j = 0; j < parser.Start.Outgoing.Length; j++)
{
if (parser.Start.Outgoing[j].Match((char)i))
{
if (match)
{
isAscii = false;
break;
}
match = true;
}
}
if (match && _initialChar[i] != null)
{
isAscii = false;
}
}
if (parser.Start.Incoming.Length > 0)
{
_initial.AddOut(new NFAEpsilonTransition(parser.Start));
debug += ", uses initial epsilon";
}
else if (isAscii && !ignoreCase)
{
for (int i = 0; isAscii && i < 128; i++)
{
for (int j = 0; j < parser.Start.Outgoing.Length; j++)
{
if (parser.Start.Outgoing[j].Match((char)i))
{
_initialChar[i] = parser.Start.Outgoing[j].State;
}
}
}
debug += ", uses ASCII lookup";
}
else
{
parser.Start.MergeInto(_initial);
debug += ", uses initial state";
}
parser.End.Value = value;
value.DebugInfo = debug;
}
public int Match(ReaderBuffer buffer, TokenMatch match)
{
int length = 0;
int pos = 1;
NFAState state;
// The first step of the match loop has been unrolled and
// optimized for performance below.
this._queue.Clear();
var peekChar = buffer.Peek(0);
if (0 <= peekChar && peekChar < 128)
{
state = this._initialChar[peekChar];
if (state != null)
{
this._queue.AddLast(state);
}
}
if (peekChar >= 0)
{
this._initial.MatchTransitions((char)peekChar, this._queue, true);
}
this._queue.MarkEnd();
peekChar = buffer.Peek(1);
// The remaining match loop processes all subsequent states
while (!this._queue.Empty)
{
if (this._queue.Marked)
{
pos++;
peekChar = buffer.Peek(pos);
this._queue.MarkEnd();
}
state = this._queue.RemoveFirst();
if (state.Value != null)
{
match.Update(pos, state.Value);
}
if (peekChar >= 0)
{
state.MatchTransitions((char)peekChar, this._queue, false);
}
}
return length;
}
}
/**
* An NFA state. The NFA consists of a series of states, each
* having zero or more transitions to other states.
*/
internal class NFAState
{
internal TokenPattern Value = null;
internal NFATransition[] Incoming = new NFATransition[0];
internal NFATransition[] Outgoing = new NFATransition[0];
internal bool EpsilonOut = false;
public bool HasTransitions()
{
return Incoming.Length > 0 || Outgoing.Length > 0;
}
public bool IsAsciiOutgoing()
{
for (int i = 0; i < Outgoing.Length; i++)
{
if (!Outgoing[i].IsAscii())
{
return false;
}
}
return true;
}
public void AddIn(NFATransition trans)
{
Array.Resize(ref Incoming, Incoming.Length + 1);
Incoming[Incoming.Length - 1] = trans;
}
public NFAState AddOut(char ch, bool ignoreCase, NFAState state)
{
if (ignoreCase)
{
if (state == null)
{
state = new NFAState();
}
AddOut(new NFACharTransition(Char.ToLower(ch), state));
AddOut(new NFACharTransition(Char.ToUpper(ch), state));
return state;
}
else
{
if (state == null)
{
state = FindUniqueCharTransition(ch);
if (state != null)
{
return state;
}
state = new NFAState();
}
return AddOut(new NFACharTransition(ch, state));
}
}
public NFAState AddOut(NFATransition trans)
{
Array.Resize(ref Outgoing, Outgoing.Length + 1);
Outgoing[Outgoing.Length - 1] = trans;
if (trans is NFAEpsilonTransition)
{
EpsilonOut = true;
}
return trans.State;
}
public void MergeInto(NFAState state)
{
for (int i = 0; i < Incoming.Length; i++)
{
state.AddIn(Incoming[i]);
Incoming[i].State = state;
}
Incoming = null;
for (int i = 0; i < Outgoing.Length; i++)
{
state.AddOut(Outgoing[i]);
}
Outgoing = null;
}
private NFAState FindUniqueCharTransition(char ch)
{
NFATransition res = null;
NFATransition trans;
for (int i = 0; i < Outgoing.Length; i++)
{
trans = Outgoing[i];
if (trans.Match(ch) && trans is NFACharTransition)
{
if (res != null)
{
return null;
}
res = trans;
}
}
for (int i = 0; res != null && i < Outgoing.Length; i++)
{
trans = Outgoing[i];
if (trans != res && trans.State == res.State)
{
return null;
}
}
return res?.State;
}
public void MatchTransitions(char ch, NFAStateQueue queue, bool initial)
{
for (int i = 0; i < Outgoing.Length; i++)
{
var trans = Outgoing[i];
var target = trans.State;
if (initial && trans is NFAEpsilonTransition)
{
target.MatchTransitions(ch, queue, true);
}
else if (trans.Match(ch))
{
queue.AddLast(target);
if (target.EpsilonOut)
{
target.MatchEmpty(queue);
}
}
}
}
public void MatchEmpty(NFAStateQueue queue)
{
for (int i = 0; i < Outgoing.Length; i++)
{
var trans = Outgoing[i];
if (trans is NFAEpsilonTransition)
{
var target = trans.State;
queue.AddLast(target);
if (target.EpsilonOut)
{
target.MatchEmpty(queue);
}
}
}
}
}
/**
* An NFA state transition. A transition checks a single
* character of input an determines if it is a match. If a match
* is encountered, the NFA should move forward to the transition
* state.
*/
internal abstract class NFATransition
{
internal NFAState State;
protected NFATransition(NFAState state)
{
this.State = state;
this.State.AddIn(this);
}
public abstract bool IsAscii();
public abstract bool Match(char ch);
public abstract NFATransition Copy(NFAState state);
}
/**
* The special epsilon transition. This transition matches the
* empty input, i.e. it is an automatic transition that doesn't
* read any input. As such, it returns false in the match method
* and is handled specially everywhere.
*/
internal class NFAEpsilonTransition : NFATransition
{
public NFAEpsilonTransition(NFAState state) : base(state)
{
}
public override bool IsAscii()
{
return false;
}
public override bool Match(char ch)
{
return false;
}
public override NFATransition Copy(NFAState state)
{
return new NFAEpsilonTransition(state);
}
}
/**
* A single character match transition.
*/
internal class NFACharTransition : NFATransition
{
private readonly char _match;
public NFACharTransition(char match, NFAState state) : base(state)
{
_match = match;
}
public override bool IsAscii()
{
return 0 <= _match && _match < 128;
}
public override bool Match(char ch)
{
return this._match == ch;
}
public override NFATransition Copy(NFAState state)
{
return new NFACharTransition(_match, state);
}
}
/**
* A character range match transition. Used for user-defined
* character sets in regular expressions.
*/
internal class NFACharRangeTransition : NFATransition
{
protected bool Inverse;
protected bool IgnoreCase;
private object[] _contents = new object[0];
public NFACharRangeTransition(bool inverse,
bool ignoreCase,
NFAState state) : base(state)
{
this.Inverse = inverse;
this.IgnoreCase = ignoreCase;
}
public override bool IsAscii()
{
if (Inverse)
{
return false;
}
for (int i = 0; i < _contents.Length; i++)
{
var obj = _contents[i];
if (obj is char)
{
var c = (char)obj;
if (c < 0 || 128 <= c)
{
return false;
}
}
else if (obj is Range)
{
if (!((Range)obj).IsAscii())
{
return false;
}
}
}
return true;
}
public void AddCharacter(char c)
{
if (IgnoreCase)
{
c = Char.ToLower(c);
}
AddContent(c);
}
public void AddRange(char min, char max)
{
if (IgnoreCase)
{
min = Char.ToLower(min);
max = Char.ToLower(max);
}
AddContent(new Range(min, max));
}
private void AddContent(Object obj)
{
Array.Resize(ref _contents, _contents.Length + 1);
_contents[_contents.Length - 1] = obj;
}
public override bool Match(char ch)
{
object obj;
char c;
Range r;
if (IgnoreCase)
{
ch = Char.ToLower(ch);
}
for (int i = 0; i < _contents.Length; i++)
{
obj = _contents[i];
if (obj is char)
{
c = (char)obj;
if (c == ch)
{
return !Inverse;
}
}
else if (obj is Range)
{
r = (Range)obj;
if (r.Inside(ch))
{
return !Inverse;
}
}
}
return Inverse;
}
public override NFATransition Copy(NFAState state)
{
var copy = new NFACharRangeTransition(Inverse, IgnoreCase, state) { _contents = _contents };
return copy;
}
private class Range
{
private readonly char _min;
private readonly char _max;
public Range(char min, char max)
{
this._min = min;
this._max = max;
}
public bool IsAscii()
{
return 0 <= _min && _min < 128 &&
0 <= _max && _max < 128;
}
public bool Inside(char c)
{
return _min <= c && c <= _max;
}
}
}
/**
* The dot ('.') character set transition. This transition
* matches a single character that is not equal to a newline
* character.
*/
internal class NFADotTransition : NFATransition
{
public NFADotTransition(NFAState state) : base(state)
{
}
public override bool IsAscii()
{
return false;
}
public override bool Match(char ch)
{
switch (ch)
{
case '\n':
case '\r':
case '\u0085':
case '\u2028':
case '\u2029':
return false;
default:
return true;
}
}
public override NFATransition Copy(NFAState state)
{
return new NFADotTransition(state);
}
}
/**
* The digit character set transition. This transition matches a
* single numeric character.
*/
internal class NFADigitTransition : NFATransition
{
public NFADigitTransition(NFAState state) : base(state)
{
}
public override bool IsAscii()
{
return true;
}
public override bool Match(char ch)
{
return '0' <= ch && ch <= '9';
}
public override NFATransition Copy(NFAState state)
{
return new NFADigitTransition(state);
}
}
/**
* The non-digit character set transition. This transition
* matches a single non-numeric character.
*/
internal class NFANonDigitTransition : NFATransition
{
public NFANonDigitTransition(NFAState state) : base(state)
{
}
public override bool IsAscii()
{
return false;
}
public override bool Match(char ch)
{
return ch < '0' || '9' < ch;
}
public override NFATransition Copy(NFAState state)
{
return new NFANonDigitTransition(state);
}
}
/**
* The whitespace character set transition. This transition
* matches a single whitespace character.
*/
internal class NFAWhitespaceTransition : NFATransition
{
public NFAWhitespaceTransition(NFAState state) : base(state)
{
}
public override bool IsAscii()
{
return true;
}
public override bool Match(char ch)
{
switch (ch)
{
case ' ':
case '\t':
case '\n':
case '\f':
case '\r':
case (char)11:
return true;
default:
return false;
}
}
public override NFATransition Copy(NFAState state)
{
return new NFAWhitespaceTransition(state);
}
}
/**
* The non-whitespace character set transition. This transition
* matches a single non-whitespace character.
*/
internal class NFANonWhitespaceTransition : NFATransition
{
public NFANonWhitespaceTransition(NFAState state) : base(state)
{
}
public override bool IsAscii()
{
return false;
}
public override bool Match(char ch)
{
switch (ch)
{
case ' ':
case '\t':
case '\n':
case '\f':
case '\r':
case (char)11:
return false;
default:
return true;
}
}
public override NFATransition Copy(NFAState state)
{
return new NFANonWhitespaceTransition(state);
}
}
/**
* The word character set transition. This transition matches a
* single word character.
*/
internal class NFAWordTransition : NFATransition
{
public NFAWordTransition(NFAState state) : base(state)
{
}
public override bool IsAscii()
{
return true;
}
public override bool Match(char ch)
{
return ('a' <= ch && ch <= 'z')
|| ('A' <= ch && ch <= 'Z')
|| ('0' <= ch && ch <= '9')
|| ch == '_';
}
public override NFATransition Copy(NFAState state)
{
return new NFAWordTransition(state);
}
}
/**
* The non-word character set transition. This transition matches
* a single non-word character.
*/
internal class NFANonWordTransition : NFATransition
{
public NFANonWordTransition(NFAState state) : base(state)
{
}
public override bool IsAscii()
{
return false;
}
public override bool Match(char ch)
{
bool word = ('a' <= ch && ch <= 'z')
|| ('A' <= ch && ch <= 'Z')
|| ('0' <= ch && ch <= '9')
|| ch == '_';
return !word;
}
public override NFATransition Copy(NFAState state)
{
return new NFANonWordTransition(state);
}
}
/**
* An NFA state queue. This queue is used during processing to
* keep track of the current and subsequent NFA states. The
* current state is read from the beginning of the queue, and new
* states are added at the end. A marker index is used to
* separate the current from the subsequent states.<p>
*
* The queue implementation is optimized for quick removal at the
* beginning and addition at the end. It will attempt to use a
* fixed-size array to store the whole queue, and moves the data
* in this array only when absolutely needed. The array is also
* enlarged automatically if too many states are being processed
* at a single time.
*/
internal class NFAStateQueue
{
private NFAState[] _queue = new NFAState[2048];
private int _first = 0;
private int _last = 0;
private int _mark = 0;
public bool Empty => (_last <= _first);
public bool Marked => _first == _mark;
public void Clear()
{
_first = 0;
_last = 0;
_mark = 0;
}
public void MarkEnd()
{
_mark = _last;
}
public NFAState RemoveFirst()
{
if (_first < _last)
{
_first++;
return _queue[_first - 1];
}
else
{
return null;
}
}
public void AddLast(NFAState state)
{
if (_last >= _queue.Length)
{
if (_first <= 0)
{
Array.Resize(ref _queue, _queue.Length * 2);
}
else
{
Array.Copy(_queue, _first, _queue, 0, _last - _first);
_last -= _first;
_mark -= _first;
_first = 0;
}
}
_queue[_last++] = state;
}
}
}