Logo Search packages:      
Sourcecode: ldc version File versions

regexp.d

// Regular Expressions

/*
 *  Copyright (C) 2000-2005 by Digital Mars, www.digitalmars.com
 *  Written by Walter Bright
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  o  The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgment in the product documentation would be
 *     appreciated but is not required.
 *  o  Altered source versions must be plainly marked as such, and must not
 *     be misrepresented as being the original software.
 *  o  This notice may not be removed or altered from any source
 *     distribution.
 */

/* NOTE: This file has been patched from the original DMD distribution to
   work with the GDC compiler.

   Modified by David Friedman, September 2004
*/

/**********************************************
 * $(LINK2 http://www.digitalmars.com/ctg/regular.html, Regular expressions)
 * are a powerful method of string pattern matching.
 * The regular expression
 * language used is the same as that commonly used, however, some of the very
 * advanced forms may behave slightly differently.
 *
 * std.regexp is designed to work only with valid UTF strings as input.
 * To validate untrusted input, use std.utf.validate().
 *
 * In the following guide, $(I pattern)[] refers to a
 * $(LINK2 http://www.digitalmars.com/ctg/regular.html, regular expression).
 * The $(I attributes)[] refers to
      a string controlling the interpretation
      of the regular expression.
      It consists of a sequence of one or more
      of the following characters:

      <table border=1 cellspacing=0 cellpadding=5>
      <caption>Attribute Characters</caption>
      $(TR $(TH Attribute) $(TH Action))
      <tr>
      $(TD $(B g))
      $(TD global; repeat over the whole input string)
      </tr>
      <tr>
      $(TD $(B i))
      $(TD case insensitive)
      </tr>
      <tr>
      $(TD $(B m))
      $(TD treat as multiple lines separated by newlines)
      </tr>
      </table>
 *
 * The $(I format)[] string has the formatting characters:
 *
 *    <table border=1 cellspacing=0 cellpadding=5>
      <caption>Formatting Characters</caption>
      $(TR $(TH Format) $(TH Replaced With))
      $(TR
      $(TD $(B $$))     $(TD $)
      )
      $(TR
      $(TD $(B $&))     $(TD The matched substring.)
      )
      $(TR
      $(TD $(B $`))     $(TD The portion of string that precedes the matched substring.)
      )
      $(TR
      $(TD $(B $'))     $(TD The portion of string that follows the matched substring.)
      )
      $(TR
      $(TD $(B $(DOLLAR))$(I n)) $(TD The $(I n)th capture, where $(I n)
                  is a single digit 1-9
                  and $$(I n) is not followed by a decimal digit.)
      )
      $(TR
      $(TD $(B $(DOLLAR))$(I nn)) $(TD The $(I nn)th capture, where $(I nn)
                  is a two-digit decimal
                  number 01-99.
                  If $(I nn)th capture is undefined or more than the number
                  of parenthesized subexpressions, use the empty
                  string instead.)
      )
      </table>

 *    Any other $ are left as is.
 *
 * References:
 *    $(LINK2 http://en.wikipedia.org/wiki/Regular_expressions, Wikipedia)
 * Macros:
 *    WIKI = StdRegexp
 *    DOLLAR = $
 */

/*
      Escape sequences:

      \nnn starts out a 1, 2 or 3 digit octal sequence,
      where n is an octal digit. If nnn is larger than
      0377, then the 3rd digit is not part of the sequence
      and is not consumed.
      For maximal portability, use exactly 3 digits.

      \xXX starts out a 1 or 2 digit hex sequence. X
      is a hex character. If the first character after the \x
      is not a hex character, the value of the sequence is 'x'
      and the XX are not consumed.
      For maximal portability, use exactly 2 digits.

      \uUUUU is a unicode sequence. There are exactly
      4 hex characters after the \u, if any are not, then
      the value of the sequence is 'u', and the UUUU are not
      consumed.

      Character classes:

      [a-b], where a is greater than b, will produce
      an error.

      References:

      http://www.unicode.org/unicode/reports/tr18/
 */

module std.regexp;

//debug = regexp;       // uncomment to turn on debugging printf's

private
{
    import std.c.stdio;
    import std.c.stdlib;
    import std.c.string;
    import std.stdio;
    import std.string;
    import std.ctype;
    import std.outbuffer;
    import std.bitarray;
    import std.utf;
    import std.intrinsic;
}

/** Regular expression to extract an _email address */
const char[] email =
    r"[a-zA-Z]([.]?([[a-zA-Z0-9_]-]+)*)?@([[a-zA-Z0-9_]\-_]+\.)+[a-zA-Z]{2,6}";

/** Regular expression to extract a _url */
const char[] url = r"(([h|H][t|T]|[f|F])[t|T][p|P]([s|S]?)\:\/\/|~/|/)?([\w]+:\w+@)?(([a-zA-Z]{1}([\w\-]+\.)+([\w]{2,5}))(:[\d]{1,5})?)?((/?\w+/)+|/?)(\w+\.[\w]{3,4})?([,]\w+)*((\?\w+=\w+)?(&\w+=\w+)*([,]\w*)*)?";

/************************************
 * One of these gets thrown on compilation errors
 */

class RegExpException : Exception
{
    this(char[] msg)
    {
      super(msg);
    }
}

struct regmatch_t
{
    int rm_so;                // index of start of match
    int rm_eo;                // index past end of match
}

private alias char rchar;     // so we can make a wchar version

/******************************************************
 * Search string for matches with regular expression
 * pattern with attributes.
 * Replace each match with string generated from format.
 * Params:
 *    string = String to search.
 *    pattern = Regular expression pattern.
 *    format = Replacement string format.
 *    attributes = Regular expression attributes.
 * Returns:
 *    the resulting string
 * Example:
 *    Replace the letters 'a' with the letters 'ZZ'.
 * ---
 * s = "Strap a rocket engine on a chicken."
 * sub(s, "a", "ZZ")        // result: StrZZp a rocket engine on a chicken.
 * sub(s, "a", "ZZ", "g")   // result: StrZZp ZZ rocket engine on ZZ chicken.
 * ---
 *    The replacement format can reference the matches using
 *    the $&amp;, $$, $', $`, $0 .. $99 notation:
 * ---
 * sub(s, "[ar]", "[$&]", "g") // result: St[r][a]p [a] [r]ocket engine on [a] chi
 * ---
 */

char[] sub(char[] string, char[] pattern, char[] format, char[] attributes = null)
{
    auto r = new RegExp(pattern, attributes);
    auto result = r.replace(string, format);
    delete r;
    return result;
}

unittest
{
    debug(regexp) printf("regexp.sub.unittest\n");

    char[] r = sub("hello", "ll", "ss");
    assert(r == "hesso");
}

/*******************************************************
 * Search string for matches with regular expression
 * pattern with attributes.
 * Pass each match to delegate dg.
 * Replace each match with the return value from dg.
 * Params:
 *    string = String to search.
 *    pattern = Regular expression pattern.
 *    dg = Delegate
 *    attributes = Regular expression attributes.
 * Returns: the resulting string.
 * Example:
 * Capitalize the letters 'a' and 'r':
 * ---
 * s = "Strap a rocket engine on a chicken.";
 * sub(s, "[ar]",
 *    delegate char[] (RegExp m)
 *    {
 *         return toupper(m.match(0));
 *    },
 *    "g");    // result: StRAp A Rocket engine on A chicken.
 * ---
 */

char[] sub(char[] string, char[] pattern, char[] delegate(RegExp) dg, char[] attributes = null)
{
    auto r = new RegExp(pattern, attributes);
    rchar[] result;
    int lastindex;
    int offset;

    result = string;
    lastindex = 0;
    offset = 0;
    while (r.test(string, lastindex))
    {
      int so = r.pmatch[0].rm_so;
      int eo = r.pmatch[0].rm_eo;

      rchar[] replacement = dg(r);

      // Optimize by using std.string.replace if possible - Dave Fladebo
      rchar[] slice = result[offset + so .. offset + eo];
      if (r.attributes & RegExp.REA.global &&         // global, so replace all
          !(r.attributes & RegExp.REA.ignoreCase) &&  // not ignoring case
          !(r.attributes & RegExp.REA.multiline) &&   // not multiline
          pattern == slice)                     // simple pattern (exact match, no special characters) 
      {
          debug(regexp)
            printf("pattern: %.*s, slice: %.*s, replacement: %.*s\n",
                cast(int) pattern.length, pattern.ptr,
                cast(int) (eo-so), result.ptr + offset,
                cast(int) replacement.length, replacement.ptr);
          result = std.string.replace(result,slice,replacement);
          break;
      }

      result = replaceSlice(result, result[offset + so .. offset + eo], replacement);

      if (r.attributes & RegExp.REA.global)
      {
          offset += replacement.length - (eo - so);

          if (lastindex == eo)
            lastindex++;            // always consume some source
          else
            lastindex = eo;
      }
      else
          break;
    }
    delete r;

    return result;
}

unittest
{
    debug(regexp) printf("regexp.sub.unittest\n");

    char[] foo(RegExp r) { return "ss"; }

    char[] r = sub("hello", "ll", delegate char[](RegExp r) { return "ss"; });
    assert(r == "hesso");

    r = sub("hello", "l", delegate char[](RegExp r) { return "l"; }, "g");
    assert(r == "hello");

    auto s = sub("Strap a rocket engine on a chicken.",
             "[ar]",
               delegate char[] (RegExp m)
               {
                return std.string.toupper(m.match(0));
               },
               "g");
    assert(s == "StRAp A Rocket engine on A chicken.");
}


/*************************************************
 * Search string[] for first match with pattern[] with attributes[].
 * Params:
 *    string = String to search.
 *    pattern = Regular expression pattern.
 *    attributes = Regular expression attributes.
 * Returns:
 *    index into string[] of match if found, -1 if no match.
 * Example:
 * ---
 * auto s = "abcabcabab";
 * std.regexp.find(s, "b");    // match, returns 1
 * std.regexp.find(s, "f");    // no match, returns -1
 * ---
 */

int find(rchar[] string, char[] pattern, char[] attributes = null)
{
    int i = -1;

    auto r = new RegExp(pattern, attributes);
    if (r.test(string))
    {
      i = r.pmatch[0].rm_so;
    }
    delete r;
    return i;
}

unittest
{
    debug(regexp) printf("regexp.find.unittest\n");

    int i;
    i = find("xabcy", "abc");
    assert(i == 1);
    i = find("cba", "abc");
    assert(i == -1);
}



/*************************************************
 * Search string[] for last match with pattern[] with attributes[].
 * Params:
 *    string = String to search.
 *    pattern = Regular expression pattern.
 *    attributes = Regular expression attributes.
 * Returns:
 *    index into string[] of match if found, -1 if no match.
 * Example:
 * ---
 * auto s = "abcabcabab";
 * std.regexp.find(s, "b");    // match, returns 9
 * std.regexp.find(s, "f");    // no match, returns -1
 * ---
 */

int rfind(rchar[] string, char[] pattern, char[] attributes = null)
{
    int i = -1;
    int lastindex = 0;

    auto r = new RegExp(pattern, attributes);
    while (r.test(string, lastindex))
    {   int eo = r.pmatch[0].rm_eo;
      i = r.pmatch[0].rm_so;
      if (lastindex == eo)
          lastindex++;        // always consume some source
      else
          lastindex = eo;
    }
    delete r;
    return i;
}

unittest
{
    int i;

    debug(regexp) printf("regexp.rfind.unittest\n");
    i = rfind("abcdefcdef", "c");
    assert(i == 6);
    i = rfind("abcdefcdef", "cd");
    assert(i == 6);
    i = rfind("abcdefcdef", "x");
    assert(i == -1);
    i = rfind("abcdefcdef", "xy");
    assert(i == -1);
    i = rfind("abcdefcdef", "");
    assert(i == 10);
}


/********************************************
 * Split string[] into an array of strings, using the regular
 * expression pattern[] with attributes[] as the separator.
 * Params:
 *    string = String to search.
 *    pattern = Regular expression pattern.
 *    attributes = Regular expression attributes.
 * Returns:
 *    array of slices into string[]
 * Example:
 * ---
 * foreach (s; split("abcabcabab", "C.", "i"))
 * {
 *     writefln("s = '%s'", s);
 * }
 * // Prints:
 * // s = 'ab'
 * // s = 'b'
 * // s = 'bab'
 * ---
 */

char[][] split(char[] string, char[] pattern, char[] attributes = null)
{
    auto r = new RegExp(pattern, attributes);
    auto result = r.split(string);
    delete r;
    return result;
}

unittest
{
    debug(regexp) printf("regexp.split.unittest()\n");
    char[][] result;

    result = split("ab", "a*");
    assert(result.length == 2);
    assert(result[0] == "");
    assert(result[1] == "b");

    foreach (i, s; split("abcabcabab", "C.", "i"))
    {
      writefln("s[%d] = '%s'", i, s);
      if (i == 0) assert(s == "ab");
      else if (i == 1) assert(s == "b");
      else if (i == 2) assert(s == "bab");
      else assert(0);
    }
}

/****************************************************
 * Search string[] for first match with pattern[] with attributes[].
 * Params:
 *    string = String to search.
 *    pattern = Regular expression pattern.
 *    attributes = Regular expression attributes.
 * Returns:
 *    corresponding RegExp if found, null if not.
 * Example:
 * ---
 * import std.stdio;
 * import std.regexp;
 *
 * void main()
 * {
 *     if (auto m = std.regexp.search("abcdef", "c"))
 *     {
 *         writefln("%s[%s]%s", m.pre, m.match(0), m.post);
 *     }
 * }
 * // Prints:
 * // ab[c]def
 * ---
 */

RegExp search(char[] string, char[] pattern, char[] attributes = null)
{
    auto r = new RegExp(pattern, attributes);

    if (r.test(string))
    {
    }
    else
    { delete r;
      r = null;
    }
    return r;
}

unittest
{
    debug(regexp) printf("regexp.string.unittest()\n");

    if (auto m = std.regexp.search("abcdef", "c()"))
    {
      auto result = std.string.format("%s[%s]%s", m.pre, m.match(0), m.post);
      assert(result == "ab[c]def");
      assert(m.match(1) == null);
      assert(m.match(2) == null);
    }
    else
      assert(0);

    if (auto n = std.regexp.search("abcdef", "g"))
    {
      assert(0);
    }
}


/* ********************************* RegExp ******************************** */

/*****************************
 * RegExp is a class to handle regular expressions.
 *
 * It is the core foundation for adding powerful string pattern matching
 * capabilities to programs like grep, text editors, awk, sed, etc.
 */
class RegExp
{
    /*****
     * Construct a RegExp object. Compile pattern
     * with <i>attributes</i> into
     * an internal form for fast execution.
     * Params:
     *      pattern = regular expression
     *  attributes = _attributes
     * Throws: RegExpException if there are any compilation errors.
     * Example:
     *  Declare two variables and assign to them a RegExp object:
     * ---
     * auto r = new RegExp("pattern");
     * auto s = new RegExp(r"p[1-5]\s*");
     * ---
     */
    public this(rchar[] pattern, rchar[] attributes = null)
    {
      pmatch = (&gmatch)[0 .. 1];
      compile(pattern, attributes);
    }

    /*****
     * Generate instance of RegExp.
     * Params:
     *      pattern = regular expression
     *  attributes = _attributes
     * Throws: RegExpException if there are any compilation errors.
     * Example:
     *  Declare two variables and assign to them a RegExp object:
     * ---
     * auto r = RegExp("pattern");
     * auto s = RegExp(r"p[1-5]\s*");
     * ---
     */
    public static RegExp opCall(rchar[] pattern, rchar[] attributes = null)
    {
      return new RegExp(pattern, attributes);
    }

    unittest
    {
      debug(regexp) printf("regexp.opCall.unittest()\n");
      auto r1 = RegExp("hello", "m");
      char[] msg;
      try
      {
          auto r2 = RegExp("hello", "q");
          assert(0);
      }
      catch (RegExpException ree)
      {
          msg = ree.toString();
          //writefln("message: %s", ree);
      }
      assert(msg == "unrecognized attribute");
    }

    /************************************
     * Set up for start of foreach loop.
     * Returns:
     *      search() returns instance of RegExp set up to _search string[].
     * Example:
     * ---
     * import std.stdio;
     * import std.regexp;
     *
     * void main()
     * {
     *     foreach(m; RegExp("ab").search("abcabcabab"))
     *     {
     *         writefln("%s[%s]%s", m.pre, m.match(0), m.post);
     *     }
     * }
     * // Prints:
     * // [ab]cabcabab
     * // abc[ab]cabab
     * // abcabc[ab]ab
     * // abcabcab[ab]
     * ---
     */

    public RegExp search(rchar[] string)
    {
      input = string;
      pmatch[0].rm_eo = 0;
      return this;
    }

    /** ditto */
    public int opApply(int delegate(inout RegExp) dg)
    {
      int result;
      RegExp r = this;

      while (test())
      {
          result = dg(r);
          if (result)
            break;
      }

      return result;
    }

    unittest
    {
      debug(regexp) printf("regexp.search.unittest()\n");

      int i;
      foreach(m; RegExp("ab").search("abcabcabab"))
      {
          auto s = std.string.format("%s[%s]%s", m.pre, m.match(0), m.post);
          if (i == 0) assert(s == "[ab]cabcabab");
          else if (i == 1) assert(s == "abc[ab]cabab");
          else if (i == 2) assert(s == "abcabc[ab]ab");
          else if (i == 3) assert(s == "abcabcab[ab]");
          else assert(0);
          i++;
      }
    }

    /******************
     * Retrieve match n.
     *
     * n==0 means the matched substring, n>0 means the
     * n'th parenthesized subexpression.
     * if n is larger than the number of parenthesized subexpressions,
     * null is returned.
     */
    public char[] match(size_t n)
    {
      if (n >= pmatch.length)
          return null;
      else
      {   size_t rm_so, rm_eo;
          rm_so = pmatch[n].rm_so;
          rm_eo = pmatch[n].rm_eo;
          if (rm_so == rm_eo)
            return null;
          return input[rm_so .. rm_eo];
      }
    }

    /*******************
     * Return the slice of the input that precedes the matched substring.
     */
    public char[] pre()
    {
      return input[0 .. pmatch[0].rm_so];
    }

    /*******************
     * Return the slice of the input that follows the matched substring.
     */
    public char[] post()
    {
      return input[pmatch[0].rm_eo .. $];
    }

    uint re_nsub;       // number of parenthesized subexpression matches
    regmatch_t[] pmatch;      // array [re_nsub + 1]

    rchar[] input;            // the string to search

    // per instance:

    rchar[] pattern;          // source text of the regular expression

    rchar[] flags;            // source text of the attributes parameter

    int errors;

    uint attributes;

    enum REA
    {
      global            = 1,  // has the g attribute
      ignoreCase  = 2,  // has the i attribute
      multiline   = 4,  // if treat as multiple lines separated
                        // by newlines, or as a single line
      dotmatchlf  = 8,  // if . matches \n
    }


private:
    size_t src;               // current source index in input[]
    size_t src_start;         // starting index for match in input[]
    size_t p;                 // position of parser in pattern[]
    regmatch_t gmatch;        // match for the entire regular expression
                        // (serves as storage for pmatch[0])

    ubyte[] program;          // pattern[] compiled into regular expression program
    OutBuffer buf;




/******************************************/

// Opcodes

enum : ubyte
{
    REend,        // end of program
    REchar,       // single character
    REichar,            // single character, case insensitive
    REdchar,            // single UCS character
    REidchar,           // single wide character, case insensitive
    REanychar,          // any character
    REanystar,          // ".*"
    REstring,           // string of characters
    REistring,          // string of characters, case insensitive
    REtestbit,          // any in bitmap, non-consuming
    REbit,        // any in the bit map
    REnotbit,           // any not in the bit map
    RErange,            // any in the string
    REnotrange,         // any not in the string
    REor,         // a | b
    REplus,       // 1 or more
    REstar,       // 0 or more
    REquest,            // 0 or 1
    REnm,         // n..m
    REnmq,        // n..m, non-greedy version
    REbol,        // beginning of line
    REeol,        // end of line
    REparen,            // parenthesized subexpression
    REgoto,       // goto offset

    REwordboundary,
    REnotwordboundary,
    REdigit,
    REnotdigit,
    REspace,
    REnotspace,
    REword,
    REnotword,
    REbackref,
};

// BUG: should this include '$'?
private int isword(dchar c) { return isalnum(c) || c == '_'; }

private uint inf = ~0u;

/* ********************************
 * Throws RegExpException on error
 */

public void compile(rchar[] pattern, rchar[] attributes)
{
    //printf("RegExp.compile('%.*s', '%.*s')\n", pattern, attributes);

    this.attributes = 0;
    foreach (rchar c; attributes)
    {   REA att;

      switch (c)
      {
          case 'g': att = REA.global;           break;
          case 'i': att = REA.ignoreCase; break;
          case 'm': att = REA.multiline;  break;
          default:
            error("unrecognized attribute");
            return;
      }
      if (this.attributes & att)
      {   error("redundant attribute");
          return;
      }
      this.attributes |= att;
    }

    input = null;

    this.pattern = pattern;
    this.flags = attributes;

    uint oldre_nsub = re_nsub;
    re_nsub = 0;
    errors = 0;

    buf = new OutBuffer();
    buf.reserve(pattern.length * 8);
    p = 0;
    parseRegexp();
    if (p < pattern.length)
    { error("unmatched ')'");
    }
    optimize();
    program = buf.data;
    buf.data = null;
    delete buf;

    if (re_nsub > oldre_nsub)
    {
      if (pmatch.ptr is &gmatch)
          pmatch = null;
      pmatch.length = re_nsub + 1;
    }
    pmatch[0].rm_so = 0;
    pmatch[0].rm_eo = 0;
}

/********************************************
 * Split string[] into an array of strings, using the regular
 * expression as the separator.
 * Returns:
 *    array of slices into string[]
 */

public rchar[][] split(rchar[] string)
{
    debug(regexp) printf("regexp.split()\n");

    rchar[][] result;

    if (string.length)
    {
      int p = 0;
      int q;
      for (q = p; q != string.length;)
      {
          if (test(string, q))
          { int e;

            q = pmatch[0].rm_so;
            e = pmatch[0].rm_eo;
            if (e != p)
            {
                result ~= string[p .. q];
                for (int i = 1; i < pmatch.length; i++)
                {
                  int so = pmatch[i].rm_so;
                  int eo = pmatch[i].rm_eo;
                  if (so == eo)
                  {   so = 0; // -1 gives array bounds error
                      eo = 0;
                  }
                  result ~= string[so .. eo];
                }
                q = p = e;
                continue;
            }
          }
          q++;
      }
      result ~= string[p .. string.length];
    }
    else if (!test(string))
      result ~= string;
    return result;
}

unittest
{
    debug(regexp) printf("regexp.split.unittest()\n");

    auto r = new RegExp("a*?", null);
    rchar[][] result;
    rchar[] j;
    int i;

    result = r.split("ab");

    assert(result.length == 2);
    i = std.string.cmp(result[0], "a");
    assert(i == 0);
    i = std.string.cmp(result[1], "b");
    assert(i == 0);

    r = new RegExp("a*", null);
    result = r.split("ab");
    assert(result.length == 2);
    i = std.string.cmp(result[0], "");
    assert(i == 0);
    i = std.string.cmp(result[1], "b");
    assert(i == 0);

    r = new RegExp("<(\\/)?([^<>]+)>", null);
    result = r.split("a<b>font</b>bar<TAG>hello</TAG>");

    for (i = 0; i < result.length; i++)
    {
      //debug(regexp) printf("result[%d] = '%.*s'\n", i, result[i]);
    }

    j = join(result, ",");
    //printf("j = '%.*s'\n", j);
    i = std.string.cmp(j, "a,,b,font,/,b,bar,,TAG,hello,/,TAG,");
    assert(i == 0);

    r = new RegExp("a[bc]", null);
    result = r.match("123ab");
    j = join(result, ",");
    i = std.string.cmp(j, "ab");
    assert(i == 0);
    
    result = r.match("ac");
    j = join(result, ",");
    i = std.string.cmp(j, "ac");
    assert(i == 0);
}

/*************************************************
 * Search string[] for match with regular expression.
 * Returns:
 *    index of match if successful, -1 if not found
 */

public int find(rchar[] string)
{
    int i;

    i = test(string);
    if (i)
      i = pmatch[0].rm_so;
    else
      i = -1;                 // no match
    return i;
}

//deprecated alias find search;

unittest
{
    debug(regexp) printf("regexp.find.unittest()\n");

    int i;
    RegExp r = new RegExp("abc", null);
    i = r.find("xabcy");
    assert(i == 1);
    i = r.find("cba");
    assert(i == -1);
}


/*************************************************
 * Search string[] for match.
 * Returns:
 *    If global attribute, return same value as exec(string).
 *    If not global attribute, return array of all matches.
 */

public rchar[][] match(rchar[] string)
{
    rchar[][] result;

    if (attributes & REA.global)
    {
      int lastindex = 0;

      while (test(string, lastindex))
      {   int eo = pmatch[0].rm_eo;

          result ~= input[pmatch[0].rm_so .. eo];
          if (lastindex == eo)
            lastindex++;            // always consume some source
          else
            lastindex = eo;
      }
    }
    else
    {
      result = exec(string);
    }
    return result;
}

unittest
{
    debug(regexp) printf("regexp.match.unittest()\n");

    int i;
    rchar[][] result;
    rchar[] j;
    RegExp r;

    r = new RegExp("a[bc]", null);
    result = r.match("1ab2ac3");
    j = join(result, ",");
    i = std.string.cmp(j, "ab");
    assert(i == 0);

    r = new RegExp("a[bc]", "g");
    result = r.match("1ab2ac3");
    j = join(result, ",");
    i = std.string.cmp(j, "ab,ac");
    assert(i == 0);
}


/*************************************************
 * Find regular expression matches in string[]. Replace those matches
 * with a new _string composed of format[] merged with the result of the
 * matches.
 * If global, replace all matches. Otherwise, replace first match.
 * Returns: the new _string
 */

public rchar[] replace(rchar[] string, rchar[] format)
{
    rchar[] result;
    int lastindex;
    int offset;

    result = string;
    lastindex = 0;
    offset = 0;
    for (;;)
    {
      if (!test(string, lastindex))
          break;

      int so = pmatch[0].rm_so;
      int eo = pmatch[0].rm_eo;

      rchar[] replacement = replace(format);

      // Optimize by using std.string.replace if possible - Dave Fladebo
      rchar[] slice = result[offset + so .. offset + eo];
      if (attributes & REA.global &&            // global, so replace all
         !(attributes & REA.ignoreCase) &&      // not ignoring case
         !(attributes & REA.multiline) && // not multiline
         pattern == slice &&              // simple pattern (exact match, no special characters) 
         format == replacement)           // simple format, not $ formats
      {
          debug(regexp)
            printf("pattern: %.*s, slice: %.*s, format: %.*s, replacement: %.*s\n",
                cast(int) pattern.length, pattern.ptr,
                cast(int) (eo-so), result.ptr + offset,
                cast(int) format.length, format.ptr,
                cast(int) replacement.length, replacement.ptr);
          result = std.string.replace(result,slice,replacement);
          break;
      }

      result = replaceSlice(result, result[offset + so .. offset + eo], replacement);

      if (attributes & REA.global)
      {
          offset += replacement.length - (eo - so);

          if (lastindex == eo)
            lastindex++;            // always consume some source
          else
            lastindex = eo;
      }
      else
          break;
    }

    return result;
}

unittest
{
    debug(regexp) printf("regexp.replace.unittest()\n");

    int i;
    rchar[] result;
    RegExp r;

    r = new RegExp("a[bc]", "g");
    result = r.replace("1ab2ac3", "x$&y");
    i = std.string.cmp(result, "1xaby2xacy3");
    assert(i == 0);

    r = new RegExp("ab", "g");
    result = r.replace("1ab2ac3", "xy");
    i = std.string.cmp(result, "1xy2ac3");
    assert(i == 0);
}


/*************************************************
 * Search string[] for match.
 * Returns:
 *    array of slices into string[] representing matches
 */

public rchar[][] exec(rchar[] string)
{
    debug(regexp) printf("regexp.exec(string = '%.*s')\n",
      cast(int) string.length, string.ptr);
    input = string;
    pmatch[0].rm_so = 0;
    pmatch[0].rm_eo = 0;
    return exec();
}

/*************************************************
 * Pick up where last exec(string) or exec() left off,
 * searching string[] for next match.
 * Returns:
 *    array of slices into string[] representing matches
 */

public rchar[][] exec()
{
    if (!test())
      return null;

    auto result = new rchar[][pmatch.length];
    for (int i = 0; i < pmatch.length; i++)
    {
      if (pmatch[i].rm_so == pmatch[i].rm_eo)
          result[i] = null;
      else
          result[i] = input[pmatch[i].rm_so .. pmatch[i].rm_eo];
    }

    return result;
}

/************************************************
 * Search string[] for match.
 * Returns: 0 for no match, !=0 for match
 * Example:
---
import std.stdio;
import std.regexp;
import std.string;

int grep(int delegate(char[]) pred, char[][] list)
{
  int count;
  foreach (s; list)
  {  if (pred(s))
       ++count;
  }
  return count;
}

void main()
{
  auto x = grep(&RegExp("[Ff]oo").test,
                std.string.split("mary had a foo lamb"));
  writefln(x);
}
---
 * which prints: 1
 */

public int test(rchar[] string)
{
    return test(string, 0 /*pmatch[0].rm_eo*/);
}

/************************************************
 * Pick up where last test(string) or test() left off, and search again.
 * Returns: 0 for no match, !=0 for match
 */

public int test()
{
    return test(input, pmatch[0].rm_eo);
}

/************************************************
 * Test string[] starting at startindex against regular expression.
 * Returns: 0 for no match, !=0 for match
 */

public int test(char[] string, int startindex)
{
    char firstc;
    uint si;

    input = string;
    debug (regexp) printf("RegExp.test(input[] = '%.*s', startindex = %d)\n",
      cast(int) input.length, input.ptr, startindex);
    pmatch[0].rm_so = 0;
    pmatch[0].rm_eo = 0;
    if (startindex < 0 || startindex > input.length)
    {
      return 0;               // fail
    }
    //debug(regexp) printProgram(program);

    // First character optimization
    firstc = 0;
    if (program[0] == REchar)
    {
      firstc = program[1];
      if (attributes & REA.ignoreCase && isalpha(firstc))
          firstc = 0;
    }

    for (si = startindex; ; si++)
    {
      if (firstc)
      {
          if (si == input.length)
            break;                  // no match
          if (input[si] != firstc)
          {
            si++;
            if (!chr(si, firstc))   // if first character not found
                break;        // no match
          }
      }
      for (int i = 0; i < re_nsub + 1; i++)
      {
          pmatch[i].rm_so = -1;
          pmatch[i].rm_eo = -1;
      }
      src_start = src = si;
      if (trymatch(0, program.length))
      {
          pmatch[0].rm_so = si;
          pmatch[0].rm_eo = src;
          //debug(regexp) printf("start = %d, end = %d\n", gmatch.rm_so, gmatch.rm_eo);
          return 1;
      }
      // If possible match must start at beginning, we are done
      if (program[0] == REbol || program[0] == REanystar)
      {
          if (attributes & REA.multiline)
          {
            // Scan for the next \n
            if (!chr(si, '\n'))
                break;        // no match if '\n' not found
          }
          else
            break;
      }
      if (si == input.length)
          break;
      //debug(regexp) printf("Starting new try: '%.*s'\n", input[si + 1 .. input.length]);
    }
    return 0;           // no match
}

int chr(inout uint si, rchar c)
{
    for (; si < input.length; si++)
    {
      if (input[si] == c)
          return 1;
    }
    return 0;
}


void printProgram(ubyte[] prog)
{
  //debug(regexp)
  {
    uint pc;
    uint len;
    uint n;
    uint m;
    ushort *pu;
    uint *puint;
    ubyte[] s;

    printf("printProgram()\n");
    for (pc = 0; pc < prog.length; )
    {
      printf("%3d: ", pc);

      //printf("prog[pc] = %d, REchar = %d, REnmq = %d\n", prog[pc], REchar, REnmq);
      switch (prog[pc])
      {
          case REchar:
            printf("\tREchar '%c'\n", prog[pc + 1]);
            pc += 1 + char.sizeof;
            break;

          case REichar:
            printf("\tREichar '%c'\n", prog[pc + 1]);
            pc += 1 + char.sizeof;
            break;

          case REdchar:
            printf("\tREdchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
            pc += 1 + dchar.sizeof;
            break;

          case REidchar:
            printf("\tREidchar '%c'\n", *cast(dchar *)&prog[pc + 1]);
            pc += 1 + dchar.sizeof;
            break;

          case REanychar:
            printf("\tREanychar\n");
            pc++;
            break;

          case REstring:
            len = *cast(uint *)&prog[pc + 1];
            s = (&prog[pc + 1 + uint.sizeof])[0 .. len];
            printf("\tREstring x%x, '%.*s'\n", len,
                cast(int) s.length, s.ptr);
            pc += 1 + uint.sizeof + len * rchar.sizeof;
            break;

          case REistring:
            len = *cast(uint *)&prog[pc + 1];
            s = (&prog[pc + 1 + uint.sizeof])[0 .. len];
            printf("\tREistring x%x, '%.*s'\n", len,
                cast(int) s.length, s.ptr);
            pc += 1 + uint.sizeof + len * rchar.sizeof;
            break;

          case REtestbit:
            pu = cast(ushort *)&prog[pc + 1];
            printf("\tREtestbit %d, %d\n", pu[0], pu[1]);
            len = pu[1];
            pc += 1 + 2 * ushort.sizeof + len;
            break;

          case REbit:
            pu = cast(ushort *)&prog[pc + 1];
            len = pu[1];
            printf("\tREbit cmax=%02x, len=%d:", pu[0], len);
            for (n = 0; n < len; n++)
                printf(" %02x", prog[pc + 1 + 2 * ushort.sizeof + n]);
            printf("\n");
            pc += 1 + 2 * ushort.sizeof + len;
            break;

          case REnotbit:
            pu = cast(ushort *)&prog[pc + 1];
            printf("\tREnotbit %d, %d\n", pu[0], pu[1]);
            len = pu[1];
            pc += 1 + 2 * ushort.sizeof + len;
            break;

          case RErange:
            len = *cast(uint *)&prog[pc + 1];
            printf("\tRErange %d\n", len);
            // BUG: REAignoreCase?
            pc += 1 + uint.sizeof + len;
            break;

          case REnotrange:
            len = *cast(uint *)&prog[pc + 1];
            printf("\tREnotrange %d\n", len);
            // BUG: REAignoreCase?
            pc += 1 + uint.sizeof + len;
            break;

          case REbol:
            printf("\tREbol\n");
            pc++;
            break;

          case REeol:
            printf("\tREeol\n");
            pc++;
            break;

          case REor:
            len = *cast(uint *)&prog[pc + 1];
            printf("\tREor %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len);
            pc += 1 + uint.sizeof;
            break;

          case REgoto:
            len = *cast(uint *)&prog[pc + 1];
            printf("\tREgoto %d, pc=>%d\n", len, pc + 1 + uint.sizeof + len);
            pc += 1 + uint.sizeof;
            break;

          case REanystar:
            printf("\tREanystar\n");
            pc++;
            break;

          case REnm:
          case REnmq:
            // len, n, m, ()
            puint = cast(uint *)&prog[pc + 1];
            len = puint[0];
            n = puint[1];
            m = puint[2];
            printf("\tREnm%s len=%d, n=%u, m=%u, pc=>%d\n",
                (prog[pc] == REnmq) ? cast(char*)"q" : cast(char*)" ",
                len, n, m, pc + 1 + uint.sizeof * 3 + len);
            pc += 1 + uint.sizeof * 3;
            break;

          case REparen:
            // len, n, ()
            puint = cast(uint *)&prog[pc + 1];
            len = puint[0];
            n = puint[1];
            printf("\tREparen len=%d n=%d, pc=>%d\n", len, n, pc + 1 + uint.sizeof * 2 + len);
            pc += 1 + uint.sizeof * 2;
            break;

          case REend:
            printf("\tREend\n");
            return;

          case REwordboundary:
            printf("\tREwordboundary\n");
            pc++;
            break;

          case REnotwordboundary:
            printf("\tREnotwordboundary\n");
            pc++;
            break;

          case REdigit:
            printf("\tREdigit\n");
            pc++;
            break;

          case REnotdigit:
            printf("\tREnotdigit\n");
            pc++;
            break;

          case REspace:
            printf("\tREspace\n");
            pc++;
            break;

          case REnotspace:
            printf("\tREnotspace\n");
            pc++;
            break;

          case REword:
            printf("\tREword\n");
            pc++;
            break;

          case REnotword:
            printf("\tREnotword\n");
            pc++;
            break;

          case REbackref:
            printf("\tREbackref %d\n", prog[1]);
            pc += 2;
            break;

          default:
            assert(0);
      }
    }
  }
}


/**************************************************
 * Match input against a section of the program[].
 * Returns:
 *    1 if successful match
 *    0 no match
 */

int trymatch(int pc, int pcend)
{   int srcsave;
    uint len;
    uint n;
    uint m;
    uint count;
    uint pop;
    uint ss;
    regmatch_t *psave;
    uint c1;
    uint c2;
    ushort* pu;
    uint* puint;

    debug(regexp)
    {
      char[] s = input[src .. input.length];
      printf("RegExp.trymatch(pc = %d, src = '%.*s', pcend = %d)\n",
          pc, cast(int) s.length, s.ptr, pcend);
    }
    srcsave = src;
    psave = null;
    for (;;)
    {
      if (pc == pcend)        // if done matching
      {   debug(regex) printf("\tprogend\n");
          return 1;
      }

      //printf("\top = %d\n", program[pc]);
      switch (program[pc])
      {
          case REchar:
            if (src == input.length)
                goto Lnomatch;
            debug(regexp) printf("\tREchar '%c', src = '%c'\n", program[pc + 1], input[src]);
            if (program[pc + 1] != input[src])
                goto Lnomatch;
            src++;
            pc += 1 + char.sizeof;
            break;

          case REichar:
            if (src == input.length)
                goto Lnomatch;
            debug(regexp) printf("\tREichar '%c', src = '%c'\n", program[pc + 1], input[src]);
            c1 = program[pc + 1];
            c2 = input[src];
            if (c1 != c2)
            {
                if (islower(cast(rchar)c2))
                  c2 = std.ctype.toupper(cast(rchar)c2);
                else
                  goto Lnomatch;
                if (c1 != c2)
                  goto Lnomatch;
            }
            src++;
            pc += 1 + char.sizeof;
            break;

          case REdchar:
            debug(regexp) printf("\tREdchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
            if (src == input.length)
                goto Lnomatch;
            if (*(cast(dchar *)&program[pc + 1]) != input[src])
                goto Lnomatch;
            src++;
            pc += 1 + dchar.sizeof;
            break;

          case REidchar:
            debug(regexp) printf("\tREidchar '%c', src = '%c'\n", *(cast(dchar *)&program[pc + 1]), input[src]);
            if (src == input.length)
                goto Lnomatch;
            c1 = *(cast(dchar *)&program[pc + 1]);
            c2 = input[src];
            if (c1 != c2)
            {
                if (islower(cast(rchar)c2))
                  c2 = std.ctype.toupper(cast(rchar)c2);
                else
                  goto Lnomatch;
                if (c1 != c2)
                  goto Lnomatch;
            }
            src++;
            pc += 1 + dchar.sizeof;
            break;

          case REanychar:
            debug(regexp) printf("\tREanychar\n");
            if (src == input.length)
                goto Lnomatch;
            if (!(attributes & REA.dotmatchlf) && input[src] == cast(rchar)'\n')
                goto Lnomatch;
            src += std.utf.stride(input, src);
            //src++;
            pc++;
            break;

          case REstring:
            len = *cast(uint *)&program[pc + 1];
            debug(regexp)
            {
                char[] s = (&program[pc + 1 + uint.sizeof])[0 .. len];
                printf("\tREstring x%x, '%.*s'\n", len,
                  cast(int) s.length, s.ptr);
            }
            if (src + len > input.length)
                goto Lnomatch;
            if (memcmp(&program[pc + 1 + uint.sizeof], &input[src], len * rchar.sizeof))
                goto Lnomatch;
            src += len;
            pc += 1 + uint.sizeof + len * rchar.sizeof;
            break;

          case REistring:
            len = *cast(uint *)&program[pc + 1];
            debug(regexp)
            {
                char[] s = (&program[pc + 1 + uint.sizeof])[0 .. len];
                printf("\tREistring x%x, '%.*s'\n", len,
                  cast(int) s.length, s.ptr);
            }
            if (src + len > input.length)
                goto Lnomatch;
            version (Win32)
            {
                if (memicmp(cast(char*)&program[pc + 1 + uint.sizeof], &input[src], len * rchar.sizeof))
                  goto Lnomatch;
            }
            else
            {
                if (icmp((cast(char*)&program[pc + 1 + uint.sizeof])[0..len],
                       input[src .. src + len]))
                  goto Lnomatch;
            }
            src += len;
            pc += 1 + uint.sizeof + len * rchar.sizeof;
            break;

          case REtestbit:
            pu = (cast(ushort *)&program[pc + 1]);
            debug(regexp) printf("\tREtestbit %d, %d, '%c', x%02x\n",
                pu[0], pu[1], input[src], input[src]);
            if (src == input.length)
                goto Lnomatch;
            len = pu[1];
            c1 = input[src];
            //printf("[x%02x]=x%02x, x%02x\n", c1 >> 3, ((&program[pc + 1 + 4])[c1 >> 3] ), (1 << (c1 & 7)));
            if (c1 <= pu[0] &&
                !bt(cast(uint*)&(program[pc + 1 + 4]), c1)) // assumes BitArray implementation
                goto Lnomatch;
            pc += 1 + 2 * ushort.sizeof + len;
            break;

          case REbit:
            pu = (cast(ushort *)&program[pc + 1]);
            debug(regexp) printf("\tREbit %d, %d, '%c'\n",
                pu[0], pu[1], input[src]);
            if (src == input.length)
                goto Lnomatch;
            len = pu[1];
            c1 = input[src];
            if (c1 > pu[0])
                goto Lnomatch;
            if (!bt(cast(uint*)&(program[pc + 1 + 4]), c1)) // assumes BitArray implementation
                goto Lnomatch;
            src++;
            pc += 1 + 2 * ushort.sizeof + len;
            break;

          case REnotbit:
            pu = (cast(ushort *)&program[pc + 1]);
            debug(regexp) printf("\tREnotbit %d, %d, '%c'\n",
                pu[0], pu[1], input[src]);
            if (src == input.length)
                goto Lnomatch;
            len = pu[1];
            c1 = input[src];
            if (c1 <= pu[0] &&
                bt(cast(uint*)&(program[pc + 1 + 4]), c1)) // assumes BitArray implementation
                goto Lnomatch;
            src++;
            pc += 1 + 2 * ushort.sizeof + len;
            break;

          case RErange:
            len = *cast(uint *)&program[pc + 1];
            debug(regexp) printf("\tRErange %d\n", len);
            if (src == input.length)
                goto Lnomatch;
            // BUG: REA.ignoreCase?
            if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) == null)
                goto Lnomatch;
            src++;
            pc += 1 + uint.sizeof + len;
            break;

          case REnotrange:
            len = *cast(uint *)&program[pc + 1];
            debug(regexp) printf("\tREnotrange %d\n", len);
            if (src == input.length)
                goto Lnomatch;
            // BUG: REA.ignoreCase?
            if (memchr(cast(char*)&program[pc + 1 + uint.sizeof], input[src], len) != null)
                goto Lnomatch;
            src++;
            pc += 1 + uint.sizeof + len;
            break;

          case REbol:
            debug(regexp) printf("\tREbol\n");
            if (src == 0)
            {
            }
            else if (attributes & REA.multiline)
            {
                if (input[src - 1] != '\n')
                  goto Lnomatch;
            }
            else
                goto Lnomatch;
            pc++;
            break;

          case REeol:
            debug(regexp) printf("\tREeol\n");
            if (src == input.length)
            {
            }
            else if (attributes & REA.multiline && input[src] == '\n')
                src++;
            else
                goto Lnomatch;
            pc++;
            break;

          case REor:
            len = (cast(uint *)&program[pc + 1])[0];
            debug(regexp) printf("\tREor %d\n", len);
            pop = pc + 1 + uint.sizeof;
            ss = src;
            if (trymatch(pop, pcend))
            {
                if (pcend != program.length)
                { int s;

                  s = src;
                  if (trymatch(pcend, program.length))
                  {   debug(regexp) printf("\tfirst operand matched\n");
                      src = s;
                      return 1;
                  }
                  else
                  {
                      // If second branch doesn't match to end, take first anyway
                      src = ss;
                      if (!trymatch(pop + len, program.length))
                      {
                        debug(regexp) printf("\tfirst operand matched\n");
                        src = s;
                        return 1;
                      }
                  }
                  src = ss;
                }
                else
                { debug(regexp) printf("\tfirst operand matched\n");
                  return 1;
                }
            }
            pc = pop + len;         // proceed with 2nd branch
            break;

          case REgoto:
            debug(regexp) printf("\tREgoto\n");
            len = (cast(uint *)&program[pc + 1])[0];
            pc += 1 + uint.sizeof + len;
            break;

          case REanystar:
            debug(regexp) printf("\tREanystar\n");
            pc++;
            for (;;)
            {   int s1;
                int s2;

                s1 = src;
                if (src == input.length)
                  break;
                if (!(attributes & REA.dotmatchlf) && input[src] == '\n')
                  break;
                src++;
                s2 = src;

                // If no match after consumption, but it
                // did match before, then no match
                if (!trymatch(pc, program.length))
                {
                  src = s1;
                  // BUG: should we save/restore pmatch[]?
                  if (trymatch(pc, program.length))
                  {
                      src = s1;           // no match
                      break;
                  }
                }
                src = s2;
            }
            break;

          case REnm:
          case REnmq:
            // len, n, m, ()
            puint = cast(uint *)&program[pc + 1];
            len = puint[0];
            n = puint[1];
            m = puint[2];
            debug(regexp) printf("\tREnm%s len=%d, n=%u, m=%u\n", (program[pc] == REnmq) ? cast(char*)"q" : cast(char*)"", len, n, m);
            pop = pc + 1 + uint.sizeof * 3;
            for (count = 0; count < n; count++)
            {
                if (!trymatch(pop, pop + len))
                  goto Lnomatch;
            }
            if (!psave && count < m)
            {
                //version (Win32)
                  psave = cast(regmatch_t *)/*alloca*/malloc((re_nsub + 1) * regmatch_t.sizeof);
                //else
                  //psave = new regmatch_t[re_nsub + 1];
            }
            if (program[pc] == REnmq)     // if minimal munch
            {
                for (; count < m; count++)
                {   int s1;

                  memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof);
                  s1 = src;

                  if (trymatch(pop + len, program.length))
                  {
                      src = s1;
                      memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof);
                      break;
                  }

                  if (!trymatch(pop, pop + len))
                  {   debug(regexp) printf("\tdoesn't match subexpression\n");
                      break;
                  }

                  // If source is not consumed, don't
                  // infinite loop on the match
                  if (s1 == src)
                  {   debug(regexp) printf("\tsource is not consumed\n");
                      break;
                  }
                }
            }
            else  // maximal munch
            {
                for (; count < m; count++)
                {   int s1;
                  int s2;

                  memcpy(psave, pmatch.ptr, (re_nsub + 1) * regmatch_t.sizeof);
                  s1 = src;
                  if (!trymatch(pop, pop + len))
                  {   debug(regexp) printf("\tdoesn't match subexpression\n");
                      break;
                  }
                  s2 = src;

                  // If source is not consumed, don't
                  // infinite loop on the match
                  if (s1 == s2)
                  {   debug(regexp) printf("\tsource is not consumed\n");
                      break;
                  }

                  // If no match after consumption, but it
                  // did match before, then no match
                  if (!trymatch(pop + len, program.length))
                  {
                      src = s1;
                      if (trymatch(pop + len, program.length))
                      {
                        src = s1;         // no match
                        memcpy(pmatch.ptr, psave, (re_nsub + 1) * regmatch_t.sizeof);
                        break;
                      }
                  }
                  src = s2;
                }
            }
            debug(regexp) printf("\tREnm len=%d, n=%u, m=%u, DONE count=%d\n", len, n, m, count);
            pc = pop + len;
            break;

          case REparen:
            // len, ()
            debug(regexp) printf("\tREparen\n");
            puint = cast(uint *)&program[pc + 1];
            len = puint[0];
            n = puint[1];
            pop = pc + 1 + uint.sizeof * 2;
            ss = src;
            if (!trymatch(pop, pop + len))
                goto Lnomatch;
            pmatch[n + 1].rm_so = ss;
            pmatch[n + 1].rm_eo = src;
            pc = pop + len;
            break;

          case REend:
            debug(regexp) printf("\tREend\n");
            return 1;         // successful match

          case REwordboundary:
            debug(regexp) printf("\tREwordboundary\n");
            if (src > 0 && src < input.length)
            {
                c1 = input[src - 1];
                c2 = input[src];
                if (!(
                    (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) ||
                    (!isword(cast(rchar)c1) && isword(cast(rchar)c2))
                   )
                   )
                  goto Lnomatch;
            }
            pc++;
            break;

          case REnotwordboundary:
            debug(regexp) printf("\tREnotwordboundary\n");
            if (src == 0 || src == input.length)
                goto Lnomatch;
            c1 = input[src - 1];
            c2 = input[src];
            if (
                (isword(cast(rchar)c1) && !isword(cast(rchar)c2)) ||
                (!isword(cast(rchar)c1) && isword(cast(rchar)c2))
               )
                goto Lnomatch;
            pc++;
            break;

          case REdigit:
            debug(regexp) printf("\tREdigit\n");
            if (src == input.length)
                goto Lnomatch;
            if (!isdigit(input[src]))
                goto Lnomatch;
            src++;
            pc++;
            break;

          case REnotdigit:
            debug(regexp) printf("\tREnotdigit\n");
            if (src == input.length)
                goto Lnomatch;
            if (isdigit(input[src]))
                goto Lnomatch;
            src++;
            pc++;
            break;

          case REspace:
            debug(regexp) printf("\tREspace\n");
            if (src == input.length)
                goto Lnomatch;
            if (!isspace(input[src]))
                goto Lnomatch;
            src++;
            pc++;
            break;

          case REnotspace:
            debug(regexp) printf("\tREnotspace\n");
            if (src == input.length)
                goto Lnomatch;
            if (isspace(input[src]))
                goto Lnomatch;
            src++;
            pc++;
            break;

          case REword:
            debug(regexp) printf("\tREword\n");
            if (src == input.length)
                goto Lnomatch;
            if (!isword(input[src]))
                goto Lnomatch;
            src++;
            pc++;
            break;

          case REnotword:
            debug(regexp) printf("\tREnotword\n");
            if (src == input.length)
                goto Lnomatch;
            if (isword(input[src]))
                goto Lnomatch;
            src++;
            pc++;
            break;

          case REbackref:
          {
            n = program[pc + 1];
            debug(regexp) printf("\tREbackref %d\n", n);

            int so = pmatch[n + 1].rm_so;
            int eo = pmatch[n + 1].rm_eo;
            len = eo - so;
            if (src + len > input.length)
                goto Lnomatch;
            else if (attributes & REA.ignoreCase)
            {
                if (icmp(input[src .. src + len], input[so .. eo]))
                  goto Lnomatch;
            }
            else if (memcmp(&input[src], &input[so], len * rchar.sizeof))
                goto Lnomatch;
            src += len;
            pc += 2;
            break;
          }

          default:
            assert(0);
      }
    }

Lnomatch:
    debug(regexp) printf("\tnomatch pc=%d\n", pc);
    src = srcsave;
    return 0;
}

/* =================== Compiler ================== */

int parseRegexp()
{   uint offset;
    uint gotooffset;
    uint len1;
    uint len2;

    //printf("parseRegexp() '%.*s'\n", pattern[p .. pattern.length]);
    offset = buf.offset;
    for (;;)
    {
      assert(p <= pattern.length);
      if (p == pattern.length)
      {   buf.write(REend);
          return 1;
      }
      switch (pattern[p])
      {
          case ')':
            return 1;

          case '|':
            p++;
            gotooffset = buf.offset;
            buf.write(REgoto);
            buf.write(cast(uint)0);
            len1 = buf.offset - offset;
            buf.spread(offset, 1 + uint.sizeof);
            gotooffset += 1 + uint.sizeof;
            parseRegexp();
            len2 = buf.offset - (gotooffset + 1 + uint.sizeof);
            buf.data[offset] = REor;
            (cast(uint *)&buf.data[offset + 1])[0] = len1;
            (cast(uint *)&buf.data[gotooffset + 1])[0] = len2;
            break;

          default:
            parsePiece();
            break;
      }
    }
    assert(0);
}

int parsePiece()
{   uint offset;
    uint len;
    uint n;
    uint m;
    ubyte op;
    int plength = pattern.length;

    //printf("parsePiece() '%.*s'\n", pattern[p .. pattern.length]);
    offset = buf.offset;
    parseAtom();
    if (p == plength)
      return 1;
    switch (pattern[p])
    {
      case '*':
          // Special optimization: replace .* with REanystar
          if (buf.offset - offset == 1 &&
            buf.data[offset] == REanychar &&
            p + 1 < plength &&
            pattern[p + 1] != '?')
          {
            buf.data[offset] = REanystar;
            p++;
            break;
          }

          n = 0;
          m = inf;
          goto Lnm;

      case '+':
          n = 1;
          m = inf;
          goto Lnm;

      case '?':
          n = 0;
          m = 1;
          goto Lnm;

      case '{':   // {n} {n,} {n,m}
          p++;
          if (p == plength || !isdigit(pattern[p]))
            goto Lerr;
          n = 0;
          do
          {
            // BUG: handle overflow
            n = n * 10 + pattern[p] - '0';
            p++;
            if (p == plength)
                goto Lerr;
          } while (isdigit(pattern[p]));
          if (pattern[p] == '}')          // {n}
          { m = n;
            goto Lnm;
          }
          if (pattern[p] != ',')
            goto Lerr;
          p++;
          if (p == plength)
            goto Lerr;
          if (pattern[p] == /*{*/ '}')    // {n,}
          { m = inf;
            goto Lnm;
          }
          if (!isdigit(pattern[p]))
            goto Lerr;
          m = 0;              // {n,m}
          do
          {
            // BUG: handle overflow
            m = m * 10 + pattern[p] - '0';
            p++;
            if (p == plength)
                goto Lerr;
          } while (isdigit(pattern[p]));
          if (pattern[p] != /*{*/ '}')
            goto Lerr;
          goto Lnm;

      Lnm:
          p++;
          op = REnm;
          if (p < plength && pattern[p] == '?')
          { op = REnmq; // minimal munch version
            p++;
          }
          len = buf.offset - offset;
          buf.spread(offset, 1 + uint.sizeof * 3);
          buf.data[offset] = op;
          uint* puint = cast(uint *)&buf.data[offset + 1];
          puint[0] = len;
          puint[1] = n;
          puint[2] = m;
          break;

      default:
          break;
    }
    return 1;

Lerr:
    error("badly formed {n,m}");
    assert(0);
}

int parseAtom()
{   ubyte op;
    uint offset;
    rchar c;

    //printf("parseAtom() '%.*s'\n", pattern[p .. pattern.length]);
    if (p < pattern.length)
    {
      c = pattern[p];
      switch (c)
      {
          case '*':
          case '+':
          case '?':
            error("*+? not allowed in atom");
            p++;
            return 0;

          case '(':
            p++;
            buf.write(REparen);
            offset = buf.offset;
            buf.write(cast(uint)0);       // reserve space for length
            buf.write(re_nsub);
            re_nsub++;
            parseRegexp();
            *cast(uint *)&buf.data[offset] =
                buf.offset - (offset + uint.sizeof * 2);
            if (p == pattern.length || pattern[p] != ')')
            {
                error("')' expected");
                return 0;
            }
            p++;
            break;

          case '[':
            if (!parseRange())
                return 0;
            break;

          case '.':
            p++;
            buf.write(REanychar);
            break;

          case '^':
            p++;
            buf.write(REbol);
            break;

          case '$':
            p++;
            buf.write(REeol);
            break;

          case '\\':
            p++;
            if (p == pattern.length)
            {   error("no character past '\\'");
                return 0;
            }
            c = pattern[p];
            switch (c)
            {
                case 'b':    op = REwordboundary;      goto Lop;
                case 'B':    op = REnotwordboundary; goto Lop;
                case 'd':    op = REdigit;             goto Lop;
                case 'D':    op = REnotdigit;    goto Lop;
                case 's':    op = REspace;             goto Lop;
                case 'S':    op = REnotspace;    goto Lop;
                case 'w':    op = REword;        goto Lop;
                case 'W':    op = REnotword;     goto Lop;

                Lop:
                  buf.write(op);
                  p++;
                  break;

                case 'f':
                case 'n':
                case 'r':
                case 't':
                case 'v':
                case 'c':
                case 'x':
                case 'u':
                case '0':
                  c = cast(char)escape();
                  goto Lbyte;

                case '1': case '2': case '3':
                case '4': case '5': case '6':
                case '7': case '8': case '9':
                  c -= '1';
                  if (c < re_nsub)
                  {   buf.write(REbackref);
                      buf.write(cast(ubyte)c);
                  }
                  else
                  {   error("no matching back reference");
                      return 0;
                  }
                  p++;
                  break;

                default:
                  p++;
                  goto Lbyte;
            }
            break;

          default:
            p++;
          Lbyte:
            op = REchar;
            if (attributes & REA.ignoreCase)
            {
                if (isalpha(c))
                {
                  op = REichar;
                  c = cast(char)std.ctype.toupper(c);
                }
            }
            if (op == REchar && c <= 0xFF)
            {
                // Look ahead and see if we can make this into
                // an REstring
                int q;
                int len;

                for (q = p; q < pattern.length; ++q)
                { rchar qc = pattern[q];

                  switch (qc)
                  {
                      case '{':
                      case '*':
                      case '+':
                      case '?':
                        if (q == p)
                            goto Lchar;
                        q--;
                        break;

                      case '(':     case ')':
                      case '|':
                      case '[':     case ']':
                      case '.':     case '^':
                      case '$':     case '\\':
                      case '}':
                        break;

                      default:
                        continue;
                  }
                  break;
                }
                len = q - p;
                if (len > 0)
                {
                  debug(regexp) printf("writing string len %d, c = '%c', pattern[p] = '%c'\n", len+1, c, pattern[p]);
                  buf.reserve(5 + (1 + len) * rchar.sizeof);
                  buf.write((attributes & REA.ignoreCase) ? REistring : REstring);
                  buf.write(len + 1);
                  buf.write(c);
                  buf.write(pattern[p .. p + len]);
                  p = q;
                  break;
                }
            }
            if (c >= 0x80)
            {
                // Convert to dchar opcode
                op = (op == REchar) ? REdchar : REidchar;
                buf.write(op);
                buf.write(c);
            }
            else
            {
             Lchar:
                debug(regexp) printf("It's an REchar '%c'\n", c);
                buf.write(op);
                buf.write(cast(char)c);
            }
            break;
      }
    }
    return 1;
}

private:
class Range
{
    uint maxc;
    uint maxb;
    OutBuffer buf;
    ubyte* base;
    BitArray bits;

    this(OutBuffer buf)
    {
      this.buf = buf;
      if (buf.data.length)
          this.base = &buf.data[buf.offset];
    }

    void setbitmax(uint u)
    {   uint b;

      //printf("setbitmax(x%x), maxc = x%x\n", u, maxc);
      if (u > maxc)
      {
          maxc = u;
          b = u / 8;
          if (b >= maxb)
          { uint u2;

            u2 = base ? base - &buf.data[0] : 0;
            ++b;
            version (BigEndian)
            {
                while (b & (uint.sizeof-1))
                  ++b;
            }
            
            buf.fill0(b - maxb);
            base = &buf.data[u2];
            maxb = b;
            // %% moved array recreate out of this condition
            bits.ptr = cast(uint*)this.base;
          }
          //bits = (cast(bit*)this.base)[0 .. maxc + 1];
          bits.len = maxc + 1;
      }
    }

    void setbit2(uint u)
    {
      setbitmax(u + 1);
      //printf("setbit2 [x%02x] |= x%02x\n", u >> 3, 1 << (u & 7));
      bits[u] = 1;
    }

};

int parseRange()
{   ubyte op;
    int c;
    int c2;
    uint i;
    uint cmax;
    uint offset;

    cmax = 0x7F;
    p++;
    op = REbit;
    if (p == pattern.length)
      goto Lerr;
    if (pattern[p] == '^')
    {   p++;
      op = REnotbit;
      if (p == pattern.length)
          goto Lerr;
    }
    buf.write(op);
    offset = buf.offset;
    buf.write(cast(uint)0);         // reserve space for length
    buf.reserve(128 / 8);
    auto r = new Range(buf);
    if (op == REnotbit)
      r.setbit2(0);
    switch (pattern[p])
    {
      case ']':
      case '-':
          c = pattern[p];
          p++;
          r.setbit2(c);
          break;

      default:
          break;
    }

    enum RS { start, rliteral, dash };
    RS rs;

    rs = RS.start;
    for (;;)
    {
      if (p == pattern.length)
          goto Lerr;
      switch (pattern[p])
      {
          case ']':
            switch (rs)
            {   case RS.dash:
                  r.setbit2('-');
                case RS.rliteral:
                  r.setbit2(c);
                  break;
                case RS.start:
                  break;
                default:
                  assert(0);
            }
            p++;
            break;

          case '\\':
            p++;
            r.setbitmax(cmax);
            if (p == pattern.length)
                goto Lerr;
            switch (pattern[p])
            {
                case 'd':
                  for (i = '0'; i <= '9'; i++)
                      r.bits[i] = 1;
                  goto Lrs;

                case 'D':
                  for (i = 1; i < '0'; i++)
                      r.bits[i] = 1;
                  for (i = '9' + 1; i <= cmax; i++)
                      r.bits[i] = 1;
                  goto Lrs;

                case 's':
                  for (i = 0; i <= cmax; i++)
                      if (isspace(i))
                        r.bits[i] = 1;
                  goto Lrs;

                case 'S':
                  for (i = 1; i <= cmax; i++)
                      if (!isspace(i))
                        r.bits[i] = 1;
                  goto Lrs;

                case 'w':
                  for (i = 0; i <= cmax; i++)
                      if (isword(cast(rchar)i))
                        r.bits[i] = 1;
                  goto Lrs;

                case 'W':
                  for (i = 1; i <= cmax; i++)
                      if (!isword(cast(rchar)i))
                        r.bits[i] = 1;
                  goto Lrs;

                Lrs:
                  switch (rs)
                  {   case RS.dash:
                        r.setbit2('-');
                      case RS.rliteral:
                        r.setbit2(c);
                        break;
                      default:
                        break;
                  }
                  rs = RS.start;
                  continue;

                default:
                  break;
            }
            c2 = escape();
            goto Lrange;

          case '-':
            p++;
            if (rs == RS.start)
                goto Lrange;
            else if (rs == RS.rliteral)
                rs = RS.dash;
            else if (rs == RS.dash)
            {
                r.setbit2(c);
                r.setbit2('-');
                rs = RS.start;
            }
            continue;

          default:
            c2 = pattern[p];
            p++;
          Lrange:
            switch (rs)
            {   case RS.rliteral:
                  r.setbit2(c);
                case RS.start:
                  c = c2;
                  rs = RS.rliteral;
                  break;

                case RS.dash:
                  if (c > c2)
                  {   error("inverted range in character class");
                      return 0;
                  }
                  r.setbitmax(c2);
                  //printf("c = %x, c2 = %x\n",c,c2);
                  for (; c <= c2; c++)
                      r.bits[c] = 1;
                  rs = RS.start;
                  break;

                default:
                  assert(0);
            }
            continue;
      }
      break;
    }
    if (attributes & REA.ignoreCase)
    {
      // BUG: what about dchar?
      r.setbitmax(0x7F);
      for (c = 'a'; c <= 'z'; c++)
      {
          if (r.bits[c])
            r.bits[c + 'A' - 'a'] = 1;
          else if (r.bits[c + 'A' - 'a'])
            r.bits[c] = 1;
      }
    }
    //printf("maxc = %d, maxb = %d\n",r.maxc,r.maxb);
    (cast(ushort *)&buf.data[offset])[0] = cast(ushort)r.maxc;
    (cast(ushort *)&buf.data[offset])[1] = cast(ushort)r.maxb;
    return 1;

Lerr:
    error("invalid range");
    return 0;
}

void error(char[] msg)
{
    errors++;
    debug(regexp) printf("error: %.*s\n", cast(int) msg.length, msg.ptr);
//assert(0);
//*(char*)0=0;
    throw new RegExpException(msg);
}

// p is following the \ char
int escape()
in
{
    assert(p < pattern.length);
}
body
{   int c;
    int i;
    rchar tc;

    c = pattern[p];           // none of the cases are multibyte
    switch (c)
    {
      case 'b':    c = '\b';  break;
      case 'f':    c = '\f';  break;
      case 'n':    c = '\n';  break;
      case 'r':    c = '\r';  break;
      case 't':    c = '\t';  break;
      case 'v':    c = '\v';  break;

      // BUG: Perl does \a and \e too, should we?

      case 'c':
          ++p;
          if (p == pattern.length)
            goto Lretc;
          c = pattern[p];
          // Note: we are deliberately not allowing dchar letters
          if (!(('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')))
          {
           Lcerr:
            error("letter expected following \\c");
            return 0;
          }
          c &= 0x1F;
          break;

      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
          c -= '0';
          for (i = 0; i < 2; i++)
          {
            p++;
            if (p == pattern.length)
                goto Lretc;
            tc = pattern[p];
            if ('0' <= tc && tc <= '7')
            {   c = c * 8 + (tc - '0');
                // Treat overflow as if last
                // digit was not an octal digit
                if (c >= 0xFF)
                { c >>= 3;
                  return c;
                }
            }
            else
                return c;
          }
          break;

      case 'x':
          c = 0;
          for (i = 0; i < 2; i++)
          {
            p++;
            if (p == pattern.length)
                goto Lretc;
            tc = pattern[p];
            if ('0' <= tc && tc <= '9')
                c = c * 16 + (tc - '0');
            else if ('a' <= tc && tc <= 'f')
                c = c * 16 + (tc - 'a' + 10);
            else if ('A' <= tc && tc <= 'F')
                c = c * 16 + (tc - 'A' + 10);
            else if (i == 0)  // if no hex digits after \x
            {
                // Not a valid \xXX sequence
                return 'x';
            }
            else
                return c;
          }
          break;

      case 'u':
          c = 0;
          for (i = 0; i < 4; i++)
          {
            p++;
            if (p == pattern.length)
                goto Lretc;
            tc = pattern[p];
            if ('0' <= tc && tc <= '9')
                c = c * 16 + (tc - '0');
            else if ('a' <= tc && tc <= 'f')
                c = c * 16 + (tc - 'a' + 10);
            else if ('A' <= tc && tc <= 'F')
                c = c * 16 + (tc - 'A' + 10);
            else
            {
                // Not a valid \uXXXX sequence
                p -= i;
                return 'u';
            }
          }
          break;

      default:
          break;
    }
    p++;
Lretc:
    return c;
}

/* ==================== optimizer ======================= */

void optimize()
{   ubyte[] prog;

    debug(regexp) printf("RegExp.optimize()\n");
    prog = buf.toBytes();
    for (size_t i = 0; 1;)
    {
      //printf("\tprog[%d] = %d, %d\n", i, prog[i], REstring);
      switch (prog[i])
      {
          case REend:
          case REanychar:
          case REanystar:
          case REbackref:
          case REeol:
          case REchar:
          case REichar:
          case REdchar:
          case REidchar:
          case REstring:
          case REistring:
          case REtestbit:
          case REbit:
          case REnotbit:
          case RErange:
          case REnotrange:
          case REwordboundary:
          case REnotwordboundary:
          case REdigit:
          case REnotdigit:
          case REspace:
          case REnotspace:
          case REword:
          case REnotword:
            return;

          case REbol:
            i++;
            continue;

          case REor:
          case REnm:
          case REnmq:
          case REparen:
          case REgoto:
          {
            auto bitbuf = new OutBuffer;
            auto r = new Range(bitbuf);
            uint offset;

            offset = i;
            if (starrchars(r, prog[i .. prog.length]))
            {
                debug(regexp) printf("\tfilter built\n");
                buf.spread(offset, 1 + 4 + r.maxb);
                buf.data[offset] = REtestbit;
                (cast(ushort *)&buf.data[offset + 1])[0] = cast(ushort)r.maxc;
                (cast(ushort *)&buf.data[offset + 1])[1] = cast(ushort)r.maxb;
                i = offset + 1 + 4;
                buf.data[i .. i + r.maxb] = r.base[0 .. r.maxb];
            }
            return;
          }
          default:
            assert(0);
      }
    }
}

/////////////////////////////////////////
// OR the leading character bits into r.
// Limit the character range from 0..7F,
// trymatch() will allow through anything over maxc.
// Return 1 if success, 0 if we can't build a filter or
// if there is no point to one.

int starrchars(Range r, ubyte[] prog)
{   rchar c;
    uint maxc;
    uint maxb;
    uint len;
    uint b;
    uint n;
    uint m;
    ubyte* pop;

    //printf("RegExp.starrchars(prog = %p, progend = %p)\n", prog, progend);
    for (size_t i = 0; i < prog.length;)
    {
      switch (prog[i])
      {
          case REchar:
            c = prog[i + 1];
            if (c <= 0x7F)
                r.setbit2(c);
            return 1;

          case REichar:
            c = prog[i + 1];
            if (c <= 0x7F)
            {   r.setbit2(c);
                r.setbit2(std.ctype.tolower(cast(rchar)c));
            }
            return 1;

          case REdchar:
          case REidchar:
            return 1;

          case REanychar:
            return 0;         // no point

          case REstring:
            len = *cast(uint *)&prog[i + 1];
            assert(len);
            c = *cast(rchar *)&prog[i + 1 + uint.sizeof];
            debug(regexp) printf("\tREstring %d, '%c'\n", len, c);
            if (c <= 0x7F)
                r.setbit2(c);
            return 1;

          case REistring:
            len = *cast(uint *)&prog[i + 1];
            assert(len);
            c = *cast(rchar *)&prog[i + 1 + uint.sizeof];
            debug(regexp) printf("\tREistring %d, '%c'\n", len, c);
            if (c <= 0x7F)
            {   r.setbit2(std.ctype.toupper(cast(rchar)c));
                r.setbit2(std.ctype.tolower(cast(rchar)c));
            }
            return 1;

          case REtestbit:
          case REbit:
            maxc = (cast(ushort *)&prog[i + 1])[0];
            maxb = (cast(ushort *)&prog[i + 1])[1];
            if (maxc <= 0x7F)
                r.setbitmax(maxc);
            else
                maxb = r.maxb;
            for (b = 0; b < maxb; b++)
                r.base[b] |= prog[i + 1 + 4 + b];
            return 1;

          case REnotbit:
            maxc = (cast(ushort *)&prog[i + 1])[0];
            maxb = (cast(ushort *)&prog[i + 1])[1];
            if (maxc <= 0x7F)
                r.setbitmax(maxc);
            else
                maxb = r.maxb;
            for (b = 0; b < maxb; b++)
                r.base[b] |= ~prog[i + 1 + 4 + b];
            return 1;

          case REbol:
          case REeol:
            return 0;

          case REor:
            len = (cast(uint *)&prog[i + 1])[0];
            return starrchars(r, prog[i + 1 + uint.sizeof .. prog.length]) &&
                   starrchars(r, prog[i + 1 + uint.sizeof + len .. prog.length]);

          case REgoto:
            len = (cast(uint *)&prog[i + 1])[0];
            i += 1 + uint.sizeof + len;
            break;

          case REanystar:
            return 0;

          case REnm:
          case REnmq:
            // len, n, m, ()
            len = (cast(uint *)&prog[i + 1])[0];
            n   = (cast(uint *)&prog[i + 1])[1];
            m   = (cast(uint *)&prog[i + 1])[2];
            pop = &prog[i + 1 + uint.sizeof * 3];
            if (!starrchars(r, pop[0 .. len]))
                return 0;
            if (n)
                return 1;
            i += 1 + uint.sizeof * 3 + len;
            break;

          case REparen:
            // len, ()
            len = (cast(uint *)&prog[i + 1])[0];
            n   = (cast(uint *)&prog[i + 1])[1];
            pop = &prog[0] + i + 1 + uint.sizeof * 2;
            return starrchars(r, pop[0 .. len]);

          case REend:
            return 0;

          case REwordboundary:
          case REnotwordboundary:
            return 0;

          case REdigit:
            r.setbitmax('9');
            for (c = '0'; c <= '9'; c++)
                r.bits[c] = 1;
            return 1;

          case REnotdigit:
            r.setbitmax(0x7F);
            for (c = 0; c <= '0'; c++)
                r.bits[c] = 1;
            for (c = '9' + 1; c <= r.maxc; c++)
                r.bits[c] = 1;
            return 1;

          case REspace:
            r.setbitmax(0x7F);
            for (c = 0; c <= r.maxc; c++)
                if (isspace(c))
                  r.bits[c] = 1;
            return 1;

          case REnotspace:
            r.setbitmax(0x7F);
            for (c = 0; c <= r.maxc; c++)
                if (!isspace(c))
                  r.bits[c] = 1;
            return 1;

          case REword:
            r.setbitmax(0x7F);
            for (c = 0; c <= r.maxc; c++)
                if (isword(cast(rchar)c))
                  r.bits[c] = 1;
            return 1;

          case REnotword:
            r.setbitmax(0x7F);
            for (c = 0; c <= r.maxc; c++)
                if (!isword(cast(rchar)c))
                  r.bits[c] = 1;
            return 1;

          case REbackref:
            return 0;

          default:
            assert(0);
      }
    }
    return 1;
}

/* ==================== replace ======================= */

/***********************
 * After a match is found with test(), this function
 * will take the match results and, using the format
 * string, generate and return a new string.
 */

public rchar[] replace(rchar[] format)
{
    return replace3(format, input, pmatch[0 .. re_nsub + 1]);
}

// Static version that doesn't require a RegExp object to be created

public static rchar[] replace3(rchar[] format, rchar[] input, regmatch_t[] pmatch)
{
    rchar[] result;
    uint c2;
    int rm_so;
    int rm_eo;
    int i;

//    printf("replace3(format = '%.*s', input = '%.*s')\n", format, input);
    result.length = format.length;
    result.length = 0;
    for (size_t f = 0; f < format.length; f++)
    {
      auto c = format[f];
      L1:
      if (c != '$')
      {
          result ~= c;
          continue;
      }
      ++f;
      if (f == format.length)
      {
          result ~= '$';
          break;
      }
      c = format[f];
      switch (c)
      {
          case '&':
            rm_so = pmatch[0].rm_so;
            rm_eo = pmatch[0].rm_eo;
            goto Lstring;

          case '`':
            rm_so = 0;
            rm_eo = pmatch[0].rm_so;
            goto Lstring;

          case '\'':
            rm_so = pmatch[0].rm_eo;
            rm_eo = input.length;
            goto Lstring;

          case '0': case '1': case '2': case '3': case '4':
          case '5': case '6': case '7': case '8': case '9':
            i = c - '0';
            if (f + 1 == format.length)
            {
                if (i == 0)
                {
                  result ~= '$';
                  result ~= c;
                  continue;
                }
            }
            else
            {
                c2 = format[f + 1];
                if (c2 >= '0' && c2 <= '9')
                {   i = (c - '0') * 10 + (c2 - '0');
                  f++;
                }
                if (i == 0)
                {
                  result ~= '$';
                  result ~= c;
                  c = cast(char)c2;
                  goto L1;
                }
            }

            if (i < pmatch.length)
            {   rm_so = pmatch[i].rm_so;
                rm_eo = pmatch[i].rm_eo;
                goto Lstring;
            }
            break;

          Lstring:
            if (rm_so != rm_eo)
                result ~= input[rm_so .. rm_eo];
            break;

          default:
            result ~= '$';
            result ~= c;
            break;
      }
    }
    return result;
}

/************************************
 * Like replace(char[] format), but uses old style formatting:
            <table border=1 cellspacing=0 cellpadding=5>
            <th>Format
            <th>Description
            <tr>
            <td><b>&</b>
            <td>replace with the match
            </tr>
            <tr>
            <td><b></b><i>n</i>
            <td>replace with the <i>n</i>th parenthesized match, <i>n</i> is 1..9
            </tr>
            <tr>
            <td><b></b><i>c</i>
            <td>replace with char <i>c</i>.
            </tr>
            </table>
 */

public rchar[] replaceOld(rchar[] format)
{
    rchar[] result;

//printf("replace: this = %p so = %d, eo = %d\n", this, pmatch[0].rm_so, pmatch[0].rm_eo);
//printf("3input = '%.*s'\n", input);
    result.length = format.length;
    result.length = 0;
    for (size_t i; i < format.length; i++)
    {
      auto c = format[i];
      switch (c)
      {
          case '&':
//printf("match = '%.*s'\n", input[pmatch[0].rm_so .. pmatch[0].rm_eo]);
            result ~= input[pmatch[0].rm_so .. pmatch[0].rm_eo];
            break;

          case '\\':
            if (i + 1 < format.length)
            {
                c = format[++i];
                if (c >= '1' && c <= '9')
                {   uint j;

                  j = c - '0';
                  if (j <= re_nsub && pmatch[j].rm_so != pmatch[j].rm_eo)
                      result ~= input[pmatch[j].rm_so .. pmatch[j].rm_eo];
                  break;
                }
            }
            result ~= c;
            break;

          default:
            result ~= c;
            break;
      }
    }
    return result;
}

}

unittest
{   // Created and placed in public domain by Don Clugston

    auto m = search("aBC r s", `bc\x20r[\40]s`, "i");
    assert(m.pre=="a");
    assert(m.match(0)=="BC r s");
    auto m2 = search("7xxyxxx", `^\d([a-z]{2})\D\1`);
    assert(m2.match(0)=="7xxyxx");
    // Just check the parsing.
    auto m3 = search("dcbxx", `ca|b[\d\]\D\s\S\w-\W]`);
    auto m4 = search("xy", `[^\ca-\xFa\r\n\b\f\t\v\0123]{2,485}$`);
    auto m5 = search("xxx", `^^\r\n\b{13,}\f{4}\t\v\u02aF3a\w\W`);
    auto m6 = search("xxy", `.*y`);
    assert(m6.match(0)=="xxy");
    auto m7 = search("QWDEfGH", "(ca|b|defg)+", "i");
    assert(m7.match(0)=="DEfG");
    auto m8 = search("dcbxx", `a?\B\s\S`);
    auto m9 = search("dcbxx", `[-w]`);
    auto m10 = search("dcbsfd", `aB[c-fW]dB|\d|\D|\u012356|\w|\W|\s|\S`, "i");
    auto m11 = search("dcbsfd", `[]a-]`);
    m.replaceOld(`a&b\1c`);
    m.replace(`a$&b$'$1c`);
}

Generated by  Doxygen 1.6.0   Back to index