Files
Flee/Parsing/Tokenizer.cs

447 lines
13 KiB
C#

using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace Flee.Parsing
{
/**
* A character stream tokenizer. This class groups the characters read
* from the stream together into tokens ("words"). The grouping is
* controlled by token patterns that contain either a fixed string to
* search for, or a regular expression. If the stream of characters
* doesn't match any of the token patterns, a parse exception is thrown.
*/
internal class Tokenizer
{
    private bool _useTokenList;
    private readonly StringDFAMatcher _stringDfaMatcher;
    private readonly NFAMatcher _nfaMatcher;
    private readonly RegExpMatcher _regExpMatcher;
    private ReaderBuffer _buffer;
    private readonly TokenMatch _lastMatch = new TokenMatch();
    private Token _previousToken;

    /**
     * Creates a case-sensitive tokenizer reading from the given input.
     */
    public Tokenizer(TextReader input)
        : this(input, false)
    {
    }

    /**
     * Creates a tokenizer reading from the given input. The ignoreCase
     * flag is handed to each of the three pattern matchers.
     */
    public Tokenizer(TextReader input, bool ignoreCase)
    {
        _stringDfaMatcher = new StringDFAMatcher(ignoreCase);
        _nfaMatcher = new NFAMatcher(ignoreCase);
        _regExpMatcher = new RegExpMatcher(ignoreCase);
        _buffer = new ReaderBuffer(input);
    }

    /**
     * When true, every token produced by Next() is linked to the token
     * produced before it through its Previous property.
     */
    public bool UseTokenList
    {
        get { return _useTokenList; }
        set { _useTokenList = value; }
    }

    /**
     * Method-style accessor equivalent to reading UseTokenList.
     */
    public bool GetUseTokenList() => _useTokenList;

    /**
     * Method-style accessor equivalent to assigning UseTokenList.
     */
    public void SetUseTokenList(bool useTokenList)
    {
        _useTokenList = useTokenList;
    }

    /**
     * Looks up the pattern with the given id in the string, NFA and
     * regexp matchers (in that order) and returns its short string
     * form. Returns null when no matcher knows the id.
     */
    public string GetPatternDescription(int id)
    {
        var found = _stringDfaMatcher.GetPattern(id)
                    ?? _nfaMatcher.GetPattern(id)
                    ?? _regExpMatcher.GetPattern(id);
        return found?.ToShortString();
    }

    /**
     * Returns the line number currently reported by the input buffer.
     */
    public int GetCurrentLine() => _buffer.LineNumber;

    /**
     * Returns the column number currently reported by the input buffer.
     */
    public int GetCurrentColumn() => _buffer.ColumnNumber;

    /**
     * Registers a token pattern with the matcher suited to its type.
     *
     * nfa - true to first attempt a regexp pattern as an NFA pattern.
     * This handles most things except the complex repeats, i.e. {1,4}.
     * When the NFA matcher rejects the pattern, or when nfa is false,
     * the pattern is added to the regexp matcher instead.
     *
     * Throws ParserCreationException (INVALID_TOKEN) when the pattern
     * cannot be added or its type is neither STRING nor REGEXP.
     */
    public void AddPattern(TokenPattern pattern, bool nfa=true)
    {
        switch (pattern.Type)
        {
            case TokenPattern.PatternType.STRING:
                AddStringPattern(pattern);
                break;
            case TokenPattern.PatternType.REGEXP:
                AddRegExpPattern(pattern, nfa);
                break;
            default:
                throw new ParserCreationException(
                    ParserCreationException.ErrorType.INVALID_TOKEN,
                    pattern.Name,
                    "pattern type " + pattern.Type +
                    " is undefined");
        }
    }

    /**
     * Adds a fixed-string pattern to the string DFA matcher, wrapping
     * any failure in a ParserCreationException.
     */
    private void AddStringPattern(TokenPattern pattern)
    {
        try
        {
            _stringDfaMatcher.AddPattern(pattern);
        }
        catch (Exception e)
        {
            throw new ParserCreationException(
                ParserCreationException.ErrorType.INVALID_TOKEN,
                pattern.Name,
                "error adding string token: " +
                e.Message);
        }
    }

    /**
     * Adds a regexp pattern, preferring the NFA matcher when tryNfa is
     * set and falling back to the regexp matcher otherwise.
     */
    private void AddRegExpPattern(TokenPattern pattern, bool tryNfa)
    {
        if (tryNfa && TryAddNfaPattern(pattern))
        {
            return;
        }
        try
        {
            _regExpMatcher.AddPattern(pattern);
        }
        catch (Exception e)
        {
            throw new ParserCreationException(
                ParserCreationException.ErrorType.INVALID_TOKEN,
                pattern.Name,
                "regular expression contains error(s): " +
                e.Message);
        }
    }

    /**
     * Attempts to add the pattern to the NFA matcher. Any exception is
     * deliberately swallowed and reported as false so that the caller
     * can fall back to the regexp matcher.
     */
    private bool TryAddNfaPattern(TokenPattern pattern)
    {
        try
        {
            _nfaMatcher.AddPattern(pattern);
            return true;
        }
        catch (Exception)
        {
            return false;
        }
    }

    /**
     * Points the tokenizer at a new input and forgets the previous
     * token and the last match. The old buffer is not disposed here.
     */
    public void Reset(TextReader input)
    {
        _buffer = new ReaderBuffer(input);
        _previousToken = null;
        _lastMatch.Clear();
    }

    /**
     * Returns the next token that is not marked as ignored, or null
     * when the input is exhausted. Tokens whose pattern is flagged as
     * an error raise a ParseException (INVALID_TOKEN) at the token's
     * start position.
     */
    public Token Next()
    {
        while (true)
        {
            Token candidate = NextToken();
            if (candidate == null)
            {
                _previousToken = null;
                return null;
            }

            // Ignored and error tokens are linked too, since the
            // linking happens before the pattern flags are inspected.
            if (_useTokenList)
            {
                candidate.Previous = _previousToken;
                _previousToken = candidate;
            }

            var pattern = candidate.Pattern;
            if (pattern.Ignore)
            {
                continue;
            }
            if (pattern.Error)
            {
                throw new ParseException(
                    ParseException.ErrorType.INVALID_TOKEN,
                    pattern.ErrorMessage,
                    candidate.StartLine,
                    candidate.StartColumn);
            }
            return candidate;
        }
    }

    /**
     * Runs all three matchers against the buffer and turns the
     * recorded match into a token. Returns null at end of input and
     * throws ParseException (UNEXPECTED_CHAR) when nothing matches.
     * An IOException is rethrown as ParseException (IO) with line and
     * column set to -1.
     */
    private Token NextToken()
    {
        try
        {
            _lastMatch.Clear();
            _stringDfaMatcher.Match(_buffer, _lastMatch);
            _nfaMatcher.Match(_buffer, _lastMatch);
            _regExpMatcher.Match(_buffer, _lastMatch);

            if (_lastMatch.Length > 0)
            {
                // The position is sampled before the characters are read.
                int tokenLine = _buffer.LineNumber;
                int tokenColumn = _buffer.ColumnNumber;
                var image = _buffer.Read(_lastMatch.Length);
                return NewToken(_lastMatch.Pattern, image, tokenLine, tokenColumn);
            }

            if (_buffer.Peek(0) < 0)
            {
                return null;
            }

            int errorLine = _buffer.LineNumber;
            int errorColumn = _buffer.ColumnNumber;
            throw new ParseException(
                ParseException.ErrorType.UNEXPECTED_CHAR,
                _buffer.Read(1),
                errorLine,
                errorColumn);
        }
        catch (IOException e)
        {
            throw new ParseException(ParseException.ErrorType.IO,
                                     e.Message,
                                     -1,
                                     -1);
        }
    }

    /**
     * Factory hook for token creation; subclasses may override it to
     * produce their own token instances.
     */
    protected virtual Token NewToken(TokenPattern pattern,
                                     string image,
                                     int line,
                                     int column)
    {
        return new Token(pattern, image, line, column);
    }

    /**
     * Concatenates the string forms of the string, NFA and regexp
     * matchers, in that order.
     */
    public override string ToString()
    {
        return new StringBuilder()
            .Append(_stringDfaMatcher)
            .Append(_nfaMatcher)
            .Append(_regExpMatcher)
            .ToString();
    }
}
/**
 * Base class for the token pattern matchers. It keeps the list of
 * registered patterns and the case-sensitivity flag; subclasses
 * supply the actual matching.
 */
internal abstract class TokenMatcher
{
    protected TokenPattern[] Patterns = { };
    protected bool IgnoreCase = false;

    protected TokenMatcher(bool ignoreCase)
    {
        IgnoreCase = ignoreCase;
    }

    /**
     * Matches the registered patterns against the buffer and records
     * the outcome in the supplied match object.
     */
    public abstract void Match(ReaderBuffer buffer, TokenMatch match);

    /**
     * Returns the first registered pattern with the given id, or null
     * when there is none.
     */
    public TokenPattern GetPattern(int id)
    {
        foreach (TokenPattern candidate in Patterns)
        {
            if (candidate.Id == id)
            {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Appends the pattern to the end of the registered pattern array.
     */
    public virtual void AddPattern(TokenPattern pattern)
    {
        int slot = Patterns.Length;
        Array.Resize(ref Patterns, slot + 1);
        Patterns[slot] = pattern;
    }

    /**
     * Returns every registered pattern's string form, each followed
     * by two newline characters.
     */
    public override string ToString()
    {
        var text = new StringBuilder();
        foreach (TokenPattern pattern in Patterns)
        {
            text.Append(pattern).Append("\n\n");
        }
        return text.ToString();
    }
}
/**
 * Token matcher for fixed-string patterns, backed by a
 * TokenStringDFA automaton.
 */
internal class StringDFAMatcher : TokenMatcher
{
    private readonly TokenStringDFA _automaton = new TokenStringDFA();

    public StringDFAMatcher(bool ignoreCase) : base(ignoreCase)
    {
    }

    /**
     * Registers the pattern text with the automaton first and then
     * records the pattern in the base class list.
     */
    public override void AddPattern(TokenPattern pattern)
    {
        _automaton.AddMatch(pattern.Pattern, IgnoreCase, pattern);
        base.AddPattern(pattern);
    }

    /**
     * Asks the automaton for a matching pattern and, if one is found,
     * updates the match with the length of that pattern's text.
     */
    public override void Match(ReaderBuffer buffer, TokenMatch match)
    {
        TokenPattern hit = _automaton.Match(buffer, IgnoreCase);
        if (hit == null)
        {
            return;
        }
        match.Update(hit.Pattern.Length, hit);
    }
}
/**
 * Token matcher backed by a TokenNFA automaton. It accepts both
 * fixed-string and regular expression patterns.
 */
internal class NFAMatcher : TokenMatcher
{
    private readonly TokenNFA _automaton = new TokenNFA();

    public NFAMatcher(bool ignoreCase) : base(ignoreCase)
    {
    }

    /**
     * Adds the pattern to the automaton as a text match when its type
     * is STRING and as a regexp match for any other type, then records
     * it in the base class list.
     */
    public override void AddPattern(TokenPattern pattern)
    {
        bool isRegExp = pattern.Type != TokenPattern.PatternType.STRING;
        if (isRegExp)
        {
            _automaton.AddRegExpMatch(pattern.Pattern, IgnoreCase, pattern);
        }
        else
        {
            _automaton.AddTextMatch(pattern.Pattern, IgnoreCase, pattern);
        }
        base.AddPattern(pattern);
    }

    /**
     * Delegates matching to the automaton, which writes its result
     * into the supplied match object.
     */
    public override void Match(ReaderBuffer buffer, TokenMatch match)
    {
        _automaton.Match(buffer, match);
    }
}
/**
 * Token matcher that tries each registered regular expression in
 * turn. The handler array runs parallel to the base class pattern
 * array: handler i belongs to pattern i.
 */
internal class RegExpMatcher : TokenMatcher
{
    private REHandler[] _regExps = { };

    public RegExpMatcher(bool ignoreCase) : base(ignoreCase)
    {
    }

    /**
     * Builds a handler for the pattern, appends it to the handler
     * array and records the pattern in the base class list.
     */
    public override void AddPattern(TokenPattern pattern)
    {
        REHandler handler = CreateHandler(pattern);
        int slot = _regExps.Length;
        Array.Resize(ref _regExps, slot + 1);
        _regExps[slot] = handler;
        base.AddPattern(pattern);
    }

    /**
     * Creates a GrammaticaRE handler for the pattern and falls back
     * to a SystemRE handler when that fails with any exception. The
     * pattern's DebugInfo is set to describe which engine was chosen.
     */
    private REHandler CreateHandler(TokenPattern pattern)
    {
        try
        {
            REHandler grammatica = new GrammaticaRE(pattern.Pattern, IgnoreCase);
            pattern.DebugInfo = "Grammatica regexp\n" + grammatica;
            return grammatica;
        }
        catch (Exception)
        {
            REHandler native = new SystemRE(pattern.Pattern, IgnoreCase);
            pattern.DebugInfo = "native .NET regexp";
            return native;
        }
    }

    /**
     * Runs every handler against the buffer and reports each
     * positive-length result to the match object.
     */
    public override void Match(ReaderBuffer buffer, TokenMatch match)
    {
        for (int index = 0; index < _regExps.Length; index++)
        {
            int matched = _regExps[index].Match(buffer);
            if (matched <= 0)
            {
                continue;
            }
            match.Update(matched, Patterns[index]);
        }
    }
}
/**
 * Common interface for the regular expression engines used by
 * RegExpMatcher (GrammaticaRE and SystemRE in this file).
 */
internal abstract class REHandler
{
/**
 * Attempts a match against the buffer. The implementations in this
 * file return the matched length, or 0 when there is no match;
 * RegExpMatcher only treats results greater than zero as a match.
 */
public abstract int Match(ReaderBuffer buffer);
}
/**
 * Regular expression handler built on the project's own RegExp
 * implementation.
 */
internal class GrammaticaRE : REHandler
{
    private readonly RegExp _regExp;
    private Matcher _matcher;

    public GrammaticaRE(string regex, bool ignoreCase)
    {
        _regExp = new RegExp(regex, ignoreCase);
    }

    /**
     * Matches from the beginning using a matcher that is created on
     * the first call and reset onto the buffer on later calls.
     * Returns the matcher's length on success and 0 otherwise.
     */
    public override int Match(ReaderBuffer buffer)
    {
        if (_matcher != null)
        {
            _matcher.Reset(buffer);
        }
        else
        {
            _matcher = _regExp.Matcher(buffer);
        }

        if (!_matcher.MatchFromBeginning())
        {
            return 0;
        }
        return _matcher.Length();
    }
}
/**
 * Regular expression handler built on the .NET Regex class.
 */
internal class SystemRE : REHandler
{
    private readonly Regex _reg;

    public SystemRE(string regex, bool ignoreCase)
    {
        RegexOptions options = ignoreCase
            ? RegexOptions.IgnoreCase
            : RegexOptions.None;
        _reg = new Regex(regex, options);
    }

    /**
     * Returns the length of a match that starts exactly at the
     * buffer's current position, or 0 when there is no such match.
     */
    public override int Match(ReaderBuffer buffer)
    {
        // Ugly hack since .NET doesn't have a flag for when the
        // end of the input string was encountered...
        buffer.Peek(1024 * 16);

        // There is no API to limit the search to the specified
        // position, so the search starts there and the index of the
        // result is double-checked afterwards.
        string text = buffer.ToString();
        int start = buffer.Position;
        var found = _reg.Match(text, start);

        bool matchedAtStart = found.Success && found.Index == buffer.Position;
        return matchedAtStart ? found.Length : 0;
    }
}
}