508 lines
15 KiB
C#
508 lines
15 KiB
C#
using System;
|
|
using System.Collections;
|
|
using System.Globalization;
|
|
using System.IO;
|
|
using System.Text;
|
|
|
|
|
|
namespace Flee.Parsing
|
|
{
|
|
/**
|
|
* A regular expression. This class creates and holds an internal
|
|
* data structure representing a regular expression. It also
|
|
* allows creating matchers. This class is thread-safe. Multiple
|
|
* matchers may operate simultanously on the same regular
|
|
* expression.
|
|
*/
|
|
internal class RegExp
|
|
{
|
|
private readonly Element _element;
|
|
private readonly string _pattern;
|
|
private readonly bool _ignoreCase;
|
|
private int _pos;
|
|
|
|
public RegExp(string pattern)
|
|
: this(pattern, false)
|
|
{
|
|
}
|
|
|
|
public RegExp(string pattern, bool ignoreCase)
|
|
{
|
|
this._pattern = pattern;
|
|
this._ignoreCase = ignoreCase;
|
|
this._pos = 0;
|
|
this._element = ParseExpr();
|
|
if (_pos < pattern.Length)
|
|
{
|
|
throw new RegExpException(
|
|
RegExpException.ErrorType.UNEXPECTED_CHARACTER,
|
|
_pos,
|
|
pattern);
|
|
}
|
|
}
|
|
|
|
public Matcher Matcher(string str)
|
|
{
|
|
return Matcher(new ReaderBuffer(new StringReader(str)));
|
|
}
|
|
|
|
public Matcher Matcher(ReaderBuffer buffer)
|
|
{
|
|
return new Matcher((Element)_element.Clone(), buffer, _ignoreCase);
|
|
}
|
|
|
|
public override string ToString()
|
|
{
|
|
var str = new StringWriter();
|
|
str.WriteLine("Regular Expression");
|
|
str.WriteLine(" Pattern: " + _pattern);
|
|
str.Write(" Flags:");
|
|
if (_ignoreCase)
|
|
{
|
|
str.Write(" caseignore");
|
|
}
|
|
str.WriteLine();
|
|
str.WriteLine(" Compiled:");
|
|
_element.PrintTo(str, " ");
|
|
return str.ToString();
|
|
}
|
|
|
|
private Element ParseExpr()
|
|
{
|
|
var first = ParseTerm();
|
|
if (PeekChar(0) != '|')
|
|
{
|
|
return first;
|
|
}
|
|
else
|
|
{
|
|
ReadChar('|');
|
|
var second = ParseExpr();
|
|
return new AlternativeElement(first, second);
|
|
}
|
|
}
|
|
|
|
private Element ParseTerm()
|
|
{
|
|
ArrayList list = new ArrayList();
|
|
|
|
list.Add(ParseFact());
|
|
while (true)
|
|
{
|
|
switch (PeekChar(0))
|
|
{
|
|
case -1:
|
|
case ')':
|
|
case ']':
|
|
case '{':
|
|
case '}':
|
|
case '?':
|
|
case '+':
|
|
case '|':
|
|
return CombineElements(list);
|
|
default:
|
|
list.Add(ParseFact());
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
private Element ParseFact()
|
|
{
|
|
var elem = ParseAtom();
|
|
switch (PeekChar(0))
|
|
{
|
|
case '?':
|
|
case '*':
|
|
case '+':
|
|
case '{':
|
|
return ParseAtomModifier(elem);
|
|
default:
|
|
return elem;
|
|
}
|
|
}
|
|
|
|
private Element ParseAtom()
|
|
{
|
|
Element elem;
|
|
|
|
switch (PeekChar(0))
|
|
{
|
|
case '.':
|
|
ReadChar('.');
|
|
return CharacterSetElement.Dot;
|
|
case '(':
|
|
ReadChar('(');
|
|
elem = ParseExpr();
|
|
ReadChar(')');
|
|
return elem;
|
|
case '[':
|
|
ReadChar('[');
|
|
elem = ParseCharSet();
|
|
ReadChar(']');
|
|
return elem;
|
|
case -1:
|
|
case ')':
|
|
case ']':
|
|
case '{':
|
|
case '}':
|
|
case '?':
|
|
case '*':
|
|
case '+':
|
|
case '|':
|
|
throw new RegExpException(
|
|
RegExpException.ErrorType.UNEXPECTED_CHARACTER,
|
|
_pos,
|
|
_pattern);
|
|
default:
|
|
return ParseChar();
|
|
}
|
|
}
|
|
|
|
private Element ParseAtomModifier(Element elem)
|
|
{
|
|
int min = 0;
|
|
int max = -1;
|
|
RepeatElement.RepeatType type;
|
|
int firstPos;
|
|
|
|
// Read min and max
|
|
type = RepeatElement.RepeatType.GREEDY;
|
|
switch (ReadChar())
|
|
{
|
|
case '?':
|
|
min = 0;
|
|
max = 1;
|
|
break;
|
|
case '*':
|
|
min = 0;
|
|
max = -1;
|
|
break;
|
|
case '+':
|
|
min = 1;
|
|
max = -1;
|
|
break;
|
|
case '{':
|
|
firstPos = _pos - 1;
|
|
min = ReadNumber();
|
|
max = min;
|
|
if (PeekChar(0) == ',')
|
|
{
|
|
ReadChar(',');
|
|
max = -1;
|
|
if (PeekChar(0) != '}')
|
|
{
|
|
max = ReadNumber();
|
|
}
|
|
}
|
|
ReadChar('}');
|
|
if (max == 0 || (max > 0 && min > max))
|
|
{
|
|
throw new RegExpException(
|
|
RegExpException.ErrorType.INVALID_REPEAT_COUNT,
|
|
firstPos,
|
|
_pattern);
|
|
}
|
|
break;
|
|
default:
|
|
throw new RegExpException(
|
|
RegExpException.ErrorType.UNEXPECTED_CHARACTER,
|
|
_pos - 1,
|
|
_pattern);
|
|
}
|
|
|
|
// Read operator mode
|
|
if (PeekChar(0) == '?')
|
|
{
|
|
ReadChar('?');
|
|
type = RepeatElement.RepeatType.RELUCTANT;
|
|
}
|
|
else if (PeekChar(0) == '+')
|
|
{
|
|
ReadChar('+');
|
|
type = RepeatElement.RepeatType.POSSESSIVE;
|
|
}
|
|
|
|
return new RepeatElement(elem, min, max, type);
|
|
}
|
|
|
|
private Element ParseCharSet()
|
|
{
|
|
CharacterSetElement charset;
|
|
bool repeat = true;
|
|
|
|
if (PeekChar(0) == '^')
|
|
{
|
|
ReadChar('^');
|
|
charset = new CharacterSetElement(true);
|
|
}
|
|
else
|
|
{
|
|
charset = new CharacterSetElement(false);
|
|
}
|
|
|
|
while (PeekChar(0) > 0 && repeat)
|
|
{
|
|
var start = (char)PeekChar(0);
|
|
switch (start)
|
|
{
|
|
case ']':
|
|
repeat = false;
|
|
break;
|
|
case '\\':
|
|
var elem = ParseEscapeChar();
|
|
if (elem is StringElement)
|
|
{
|
|
charset.AddCharacters((StringElement)elem);
|
|
}
|
|
else
|
|
{
|
|
charset.AddCharacterSet((CharacterSetElement)elem);
|
|
}
|
|
break;
|
|
default:
|
|
ReadChar(start);
|
|
if (PeekChar(0) == '-'
|
|
&& PeekChar(1) > 0
|
|
&& PeekChar(1) != ']')
|
|
{
|
|
|
|
ReadChar('-');
|
|
var end = ReadChar();
|
|
charset.AddRange(FixChar(start), FixChar(end));
|
|
}
|
|
else
|
|
{
|
|
charset.AddCharacter(FixChar(start));
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
return charset;
|
|
}
|
|
|
|
private Element ParseChar()
|
|
{
|
|
switch (PeekChar(0))
|
|
{
|
|
case '\\':
|
|
return ParseEscapeChar();
|
|
case '^':
|
|
case '$':
|
|
throw new RegExpException(
|
|
RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
|
|
_pos,
|
|
_pattern);
|
|
default:
|
|
return new StringElement(FixChar(ReadChar()));
|
|
}
|
|
}
|
|
|
|
private Element ParseEscapeChar()
|
|
{
|
|
char c;
|
|
string str;
|
|
int value;
|
|
|
|
ReadChar('\\');
|
|
c = ReadChar();
|
|
switch (c)
|
|
{
|
|
case '0':
|
|
c = ReadChar();
|
|
if (c < '0' || c > '3')
|
|
{
|
|
throw new RegExpException(
|
|
RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
|
|
_pos - 3,
|
|
_pattern);
|
|
}
|
|
value = c - '0';
|
|
c = (char)PeekChar(0);
|
|
if ('0' <= c && c <= '7')
|
|
{
|
|
value *= 8;
|
|
value += ReadChar() - '0';
|
|
c = (char)PeekChar(0);
|
|
if ('0' <= c && c <= '7')
|
|
{
|
|
value *= 8;
|
|
value += ReadChar() - '0';
|
|
}
|
|
}
|
|
return new StringElement(FixChar((char)value));
|
|
case 'x':
|
|
str = ReadChar().ToString() +
|
|
ReadChar().ToString();
|
|
try
|
|
{
|
|
value = Int32.Parse(str,
|
|
NumberStyles.AllowHexSpecifier);
|
|
return new StringElement(FixChar((char)value));
|
|
}
|
|
catch (FormatException)
|
|
{
|
|
throw new RegExpException(
|
|
RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
|
|
_pos - str.Length - 2,
|
|
_pattern);
|
|
}
|
|
case 'u':
|
|
str = ReadChar().ToString() +
|
|
ReadChar().ToString() +
|
|
ReadChar().ToString() +
|
|
ReadChar().ToString();
|
|
try
|
|
{
|
|
value = Int32.Parse(str,
|
|
NumberStyles.AllowHexSpecifier);
|
|
return new StringElement(FixChar((char)value));
|
|
}
|
|
catch (FormatException)
|
|
{
|
|
throw new RegExpException(
|
|
RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
|
|
_pos - str.Length - 2,
|
|
_pattern);
|
|
}
|
|
case 't':
|
|
return new StringElement('\t');
|
|
case 'n':
|
|
return new StringElement('\n');
|
|
case 'r':
|
|
return new StringElement('\r');
|
|
case 'f':
|
|
return new StringElement('\f');
|
|
case 'a':
|
|
return new StringElement('\u0007');
|
|
case 'e':
|
|
return new StringElement('\u001B');
|
|
case 'd':
|
|
return CharacterSetElement.Digit;
|
|
case 'D':
|
|
return CharacterSetElement.NonDigit;
|
|
case 's':
|
|
return CharacterSetElement.Whitespace;
|
|
case 'S':
|
|
return CharacterSetElement.NonWhitespace;
|
|
case 'w':
|
|
return CharacterSetElement.Word;
|
|
case 'W':
|
|
return CharacterSetElement.NonWord;
|
|
default:
|
|
if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'))
|
|
{
|
|
throw new RegExpException(
|
|
RegExpException.ErrorType.UNSUPPORTED_ESCAPE_CHARACTER,
|
|
_pos - 2,
|
|
_pattern);
|
|
}
|
|
return new StringElement(FixChar(c));
|
|
}
|
|
}
|
|
|
|
private char FixChar(char c)
|
|
{
|
|
return _ignoreCase ? Char.ToLower(c) : c;
|
|
}
|
|
|
|
private int ReadNumber()
|
|
{
|
|
StringBuilder buf = new StringBuilder();
|
|
int c;
|
|
|
|
c = PeekChar(0);
|
|
while ('0' <= c && c <= '9')
|
|
{
|
|
buf.Append(ReadChar());
|
|
c = PeekChar(0);
|
|
}
|
|
if (buf.Length <= 0)
|
|
{
|
|
throw new RegExpException(
|
|
RegExpException.ErrorType.UNEXPECTED_CHARACTER,
|
|
_pos,
|
|
_pattern);
|
|
}
|
|
return Int32.Parse(buf.ToString());
|
|
}
|
|
|
|
private char ReadChar()
|
|
{
|
|
int c = PeekChar(0);
|
|
|
|
if (c < 0)
|
|
{
|
|
throw new RegExpException(
|
|
RegExpException.ErrorType.UNTERMINATED_PATTERN,
|
|
_pos,
|
|
_pattern);
|
|
}
|
|
else
|
|
{
|
|
_pos++;
|
|
return (char)c;
|
|
}
|
|
}
|
|
|
|
private char ReadChar(char c)
|
|
{
|
|
if (c != ReadChar())
|
|
{
|
|
throw new RegExpException(
|
|
RegExpException.ErrorType.UNEXPECTED_CHARACTER,
|
|
_pos - 1,
|
|
_pattern);
|
|
}
|
|
return c;
|
|
}
|
|
|
|
private int PeekChar(int count)
|
|
{
|
|
if (_pos + count < _pattern.Length)
|
|
{
|
|
return _pattern[_pos + count];
|
|
}
|
|
else
|
|
{
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
private Element CombineElements(ArrayList list)
|
|
{
|
|
Element elem;
|
|
int i;
|
|
// Concatenate string elements
|
|
var prev = (Element)list[0];
|
|
for (i = 1; i < list.Count; i++)
|
|
{
|
|
elem = (Element)list[i];
|
|
if (prev is StringElement
|
|
&& elem is StringElement)
|
|
{
|
|
|
|
var str = ((StringElement)prev).GetString() +
|
|
((StringElement)elem).GetString();
|
|
elem = new StringElement(str);
|
|
list.RemoveAt(i);
|
|
list[i - 1] = elem;
|
|
i--;
|
|
}
|
|
prev = elem;
|
|
}
|
|
|
|
// Combine all remaining elements
|
|
elem = (Element)list[list.Count - 1];
|
|
for (i = list.Count - 2; i >= 0; i--)
|
|
{
|
|
prev = (Element)list[i];
|
|
elem = new CombineElement(prev, elem);
|
|
}
|
|
|
|
return elem;
|
|
}
|
|
}
|
|
}
|